From 71d5a2540a98c81f5bcaeb48805e0e2881f530ef Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Sun, 16 Apr 2017 16:01:22 +0000
Subject: Vendor import of llvm trunk r300422:
  https://llvm.org/svn/llvm-project/llvm/trunk@300422

---
 .../AArch64/GlobalISel/arm64-callingconv-ios.ll | 28 +
 .../AArch64/GlobalISel/arm64-callingconv.ll | 38 +
 test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll | 107 +-
 .../AArch64/GlobalISel/arm64-instructionselect.mir | 2979 ------
 .../GlobalISel/arm64-irtranslator-stackprotect.ll | 2 +-
 .../AArch64/GlobalISel/arm64-irtranslator.ll | 573 +-
 .../AArch64/GlobalISel/arm64-regbankselect.mir | 14 +-
 test/CodeGen/AArch64/GlobalISel/call-translator.ll | 26 +-
 test/CodeGen/AArch64/GlobalISel/debug-insts.ll | 68 +
 test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll | 48 +
 test/CodeGen/AArch64/GlobalISel/gisel-abort.ll | 2 +-
 .../AArch64/GlobalISel/gisel-commandline-option.ll | 48 +
 .../gisel-fail-intermediate-legalizer.ll | 8 +
 test/CodeGen/AArch64/GlobalISel/inline-asm.ll | 10 +
 .../AArch64/GlobalISel/irtranslator-bitcast.ll | 30 +
 .../AArch64/GlobalISel/irtranslator-exceptions.ll | 60 +-
 test/CodeGen/AArch64/GlobalISel/legalize-add.mir | 27 +-
 test/CodeGen/AArch64/GlobalISel/legalize-and.mir | 7 +-
 test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir | 2 +-
 .../AArch64/GlobalISel/legalize-combines.mir | 180 +-
 .../AArch64/GlobalISel/legalize-constant.mir | 4 +-
 test/CodeGen/AArch64/GlobalISel/legalize-div.mir | 2 +-
 .../AArch64/GlobalISel/legalize-exceptions.ll | 53 +
 test/CodeGen/AArch64/GlobalISel/legalize-ext.mir | 2 +-
 test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir | 2 +-
 test/CodeGen/AArch64/GlobalISel/legalize-fneg.mir | 48 +
 test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir | 201 +
 test/CodeGen/AArch64/GlobalISel/legalize-gep.mir | 2 +-
 .../AArch64/GlobalISel/legalize-inserts.mir | 141 +
 test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir | 206 +
 .../AArch64/GlobalISel/legalize-load-store.mir | 26 +-
 test/CodeGen/AArch64/GlobalISel/legalize-mul.mir | 27 +-
 .../GlobalISel/legalize-nonpowerof2eltsvec.mir | 29 +
 test/CodeGen/AArch64/GlobalISel/legalize-or.mir | 7 +-
 test/CodeGen/AArch64/GlobalISel/legalize-pow.mir | 38 +
 test/CodeGen/AArch64/GlobalISel/legalize-rem.mir | 13 +-
 test/CodeGen/AArch64/GlobalISel/legalize-shift.mir | 47 +
 .../CodeGen/AArch64/GlobalISel/legalize-simple.mir | 125 +-
 test/CodeGen/AArch64/GlobalISel/legalize-sub.mir | 7 +-
 test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir | 39 +
 test/CodeGen/AArch64/GlobalISel/legalize-xor.mir | 7 +-
 test/CodeGen/AArch64/GlobalISel/no-regclass.mir | 30 +
 .../AArch64/GlobalISel/regbankselect-dbg-value.mir | 45 +
 .../AArch64/GlobalISel/regbankselect-default.mir | 4 +-
 .../GlobalISel/regbankselect-reg_sequence.mir | 25 +
 test/CodeGen/AArch64/GlobalISel/select-binop.mir | 1042 +++
 test/CodeGen/AArch64/GlobalISel/select-bitcast.mir | 212 +
 test/CodeGen/AArch64/GlobalISel/select-br.mir | 71 +
 test/CodeGen/AArch64/GlobalISel/select-cbz.mir | 108 +
 .../CodeGen/AArch64/GlobalISel/select-constant.mir | 77 +
 .../AArch64/GlobalISel/select-dbg-value.mir | 69 +
 .../CodeGen/AArch64/GlobalISel/select-fp-casts.mir | 478 +
 test/CodeGen/AArch64/GlobalISel/select-int-ext.mir | 274 +
 .../AArch64/GlobalISel/select-int-ptr-casts.mir | 150 +
 test/CodeGen/AArch64/GlobalISel/select-load.mir | 515 ++
 test/CodeGen/AArch64/GlobalISel/select-muladd.mir | 50 +
 .../CodeGen/AArch64/GlobalISel/select-property.mir | 21 +
 test/CodeGen/AArch64/GlobalISel/select-store.mir | 463 +
 test/CodeGen/AArch64/GlobalISel/select-trunc.mir | 81 +
test/CodeGen/AArch64/GlobalISel/select-xor.mir | 165 + test/CodeGen/AArch64/GlobalISel/select.mir | 311 + test/CodeGen/AArch64/GlobalISel/translate-gep.ll | 4 +- .../AArch64/GlobalISel/varargs-ios-translator.ll | 16 + test/CodeGen/AArch64/GlobalISel/vastart.ll | 13 + .../CodeGen/AArch64/aarch64-codegen-prepare-atp.ll | 68 + test/CodeGen/AArch64/aarch64-fold-lslfast.ll | 74 + test/CodeGen/AArch64/aarch64-gep-opt.ll | 8 +- test/CodeGen/AArch64/aarch64-named-reg-w18.ll | 14 + test/CodeGen/AArch64/aarch64-named-reg-x18.ll | 14 + test/CodeGen/AArch64/and-sink.ll | 90 + test/CodeGen/AArch64/argument-blocks.ll | 4 +- test/CodeGen/AArch64/arm64-abi-varargs.ll | 6 +- test/CodeGen/AArch64/arm64-abi.ll | 5 +- test/CodeGen/AArch64/arm64-addr-type-promotion.ll | 11 +- test/CodeGen/AArch64/arm64-addrmode.ll | 4 +- test/CodeGen/AArch64/arm64-atomic.ll | 22 +- test/CodeGen/AArch64/arm64-bitfield-extract.ll | 8 +- test/CodeGen/AArch64/arm64-blockaddress.ll | 6 +- test/CodeGen/AArch64/arm64-builtins-linux.ll | 4 + test/CodeGen/AArch64/arm64-code-model-large-abs.ll | 36 +- .../AArch64/arm64-codegen-prepare-extload.ll | 36 +- test/CodeGen/AArch64/arm64-const-addr.ll | 4 +- test/CodeGen/AArch64/arm64-crc32.ll | 1 + test/CodeGen/AArch64/arm64-elf-globals.ll | 4 + test/CodeGen/AArch64/arm64-extern-weak.ll | 18 +- .../CodeGen/AArch64/arm64-fast-isel-addr-offset.ll | 4 +- test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll | 91 +- test/CodeGen/AArch64/arm64-inline-asm.ll | 4 +- test/CodeGen/AArch64/arm64-memset-inline.ll | 4 +- test/CodeGen/AArch64/arm64-movi.ll | 22 +- test/CodeGen/AArch64/arm64-neon-copy.ll | 2 +- test/CodeGen/AArch64/arm64-neon-v8.1a.ll | 1 + test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll | 63 + test/CodeGen/AArch64/arm64-regress-opt-cmp.mir | 2 +- test/CodeGen/AArch64/arm64-shrink-wrapping.ll | 14 +- test/CodeGen/AArch64/arm64-spill-remarks.ll | 117 + test/CodeGen/AArch64/arm64-summary-remarks.ll | 15 + test/CodeGen/AArch64/arm64-variadic-aapcs.ll | 2 +- test/CodeGen/AArch64/bitfield-insert.ll | 12 +- test/CodeGen/AArch64/blockaddress.ll | 6 +- test/CodeGen/AArch64/br-cond-not-merge.ll | 94 + test/CodeGen/AArch64/branch-relax-cbz.ll | 13 +- test/CodeGen/AArch64/code-model-large-abs.ll | 30 +- .../AArch64/concat_vector-scalar-combine.ll | 6 +- test/CodeGen/AArch64/cpus.ll | 2 +- test/CodeGen/AArch64/dag-numsignbits.ll | 33 + test/CodeGen/AArch64/eliminate-trunc.ll | 4 +- test/CodeGen/AArch64/extern-weak.ll | 18 +- test/CodeGen/AArch64/fast-isel-tail-call.ll | 24 + test/CodeGen/AArch64/fast-isel-tbz.ll | 18 +- test/CodeGen/AArch64/fpimm.ll | 10 +- test/CodeGen/AArch64/jump-table.ll | 6 +- test/CodeGen/AArch64/large-consts.ll | 6 +- test/CodeGen/AArch64/ldst-opt-aa.mir | 30 + test/CodeGen/AArch64/ldst-opt.mir | 2 +- test/CodeGen/AArch64/literal_pools_float.ll | 12 +- test/CodeGen/AArch64/live-interval-analysis.mir | 22 + test/CodeGen/AArch64/load-combine-big-endian.ll | 584 ++ test/CodeGen/AArch64/load-combine.ll | 548 ++ test/CodeGen/AArch64/machine-combiner-madd.ll | 2 +- test/CodeGen/AArch64/machine-copy-remove.mir | 672 ++ test/CodeGen/AArch64/machine-outliner.ll | 43 + test/CodeGen/AArch64/mature-mc-support.ll | 2 +- test/CodeGen/AArch64/merge-store.ll | 3 +- test/CodeGen/AArch64/misched-fusion-aes.ll | 207 + test/CodeGen/AArch64/misched-fusion-lit.ll | 46 + test/CodeGen/AArch64/misched-fusion.ll | 12 +- test/CodeGen/AArch64/movimm-wzr.mir | 2 +- test/CodeGen/AArch64/movw-shift-encoding.ll | 8 +- test/CodeGen/AArch64/neon-fma-FMF.ll | 53 + test/CodeGen/AArch64/optimize-cond-branch.ll | 2 
+- test/CodeGen/AArch64/pr27816.ll | 48 + test/CodeGen/AArch64/prefixdata.ll | 29 + test/CodeGen/AArch64/regcoal-physreg.mir | 51 +- test/CodeGen/AArch64/regress-tblgen-chains.ll | 2 +- test/CodeGen/AArch64/remat.ll | 2 +- test/CodeGen/AArch64/selectiondag-order.ll | 96 + test/CodeGen/AArch64/stack-protector-target.ll | 10 + test/CodeGen/AArch64/stack_guard_remat.ll | 14 +- test/CodeGen/AArch64/tail-dup-repeat-worklist.ll | 69 - test/CodeGen/AArch64/tailcall-string-rvo.ll | 47 + test/CodeGen/AArch64/tbz-tbnz.ll | 16 +- test/CodeGen/AArch64/thread-pointer.ll | 60 + test/CodeGen/AArch64/vector_merge_dep_check.ll | 3 +- test/CodeGen/AArch64/xray-tail-call-sled.ll | 69 + test/CodeGen/AMDGPU/32-bit-local-address-space.ll | 24 +- .../AMDGPU/GlobalISel/inst-select-load-flat.mir | 28 + .../AMDGPU/GlobalISel/inst-select-load-smrd.mir | 142 + .../AMDGPU/GlobalISel/inst-select-store-flat.mir | 29 + test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir | 69 + test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll | 11 + test/CodeGen/AMDGPU/GlobalISel/smrd.ll | 89 + test/CodeGen/AMDGPU/add-debug.ll | 2 +- test/CodeGen/AMDGPU/add.i16.ll | 18 +- test/CodeGen/AMDGPU/add.ll | 16 +- test/CodeGen/AMDGPU/add.v2i16.ll | 283 + test/CodeGen/AMDGPU/add_i128.ll | 8 +- test/CodeGen/AMDGPU/add_i64.ll | 12 +- test/CodeGen/AMDGPU/addrspacecast-captured.ll | 47 + test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll | 28 +- test/CodeGen/AMDGPU/addrspacecast.ll | 129 +- test/CodeGen/AMDGPU/amdgcn.bitcast.ll | 59 +- test/CodeGen/AMDGPU/amdgcn.private-memory.ll | 2 +- test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll | 41 - test/CodeGen/AMDGPU/amdgcn.sendmsg.ll | 161 - test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll | 9 + test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll | 16 +- .../AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll | 88 +- .../AMDGPU/amdgpu-shader-calling-convention.ll | 5 +- test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 92 +- .../amdgpu.work-item-intrinsics.deprecated.ll | 30 +- test/CodeGen/AMDGPU/and-gcn.ll | 2 +- test/CodeGen/AMDGPU/and.ll | 80 +- .../CodeGen/AMDGPU/annotate-kernel-features-hsa.ll | 100 +- test/CodeGen/AMDGPU/annotate-kernel-features.ll | 78 +- test/CodeGen/AMDGPU/anonymous-gv.ll | 4 +- test/CodeGen/AMDGPU/any_extend_vector_inreg.ll | 58 + test/CodeGen/AMDGPU/anyext.ll | 4 +- test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 8 +- test/CodeGen/AMDGPU/array-ptr-calc-i64.ll | 2 +- test/CodeGen/AMDGPU/ashr.v2i16.ll | 161 + test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 10 +- test/CodeGen/AMDGPU/atomic_load_add.ll | 8 +- test/CodeGen/AMDGPU/atomic_load_sub.ll | 8 +- .../AMDGPU/attr-amdgpu-flat-work-group-size.ll | 8 +- test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll | 34 +- test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll | 2 +- test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll | 26 +- test/CodeGen/AMDGPU/attr-unparseable.ll | 16 +- test/CodeGen/AMDGPU/barrier-elimination.ll | 30 + test/CodeGen/AMDGPU/basic-branch.ll | 9 +- test/CodeGen/AMDGPU/basic-loop.ll | 2 +- test/CodeGen/AMDGPU/bfe-patterns.ll | 163 + test/CodeGen/AMDGPU/bfe_uint.ll | 4 +- test/CodeGen/AMDGPU/bfi_int.ll | 6 +- test/CodeGen/AMDGPU/bfm.ll | 4 +- test/CodeGen/AMDGPU/big_alu.ll | 110 +- test/CodeGen/AMDGPU/bitcast-vector-extract.ll | 32 +- .../CodeGen/AMDGPU/bitreverse-inline-immediates.ll | 54 +- test/CodeGen/AMDGPU/bitreverse.ll | 20 +- test/CodeGen/AMDGPU/br_cc.f16.ll | 48 +- test/CodeGen/AMDGPU/branch-condition-and.ll | 17 +- test/CodeGen/AMDGPU/branch-relax-spill.ll | 2 +- test/CodeGen/AMDGPU/branch-relaxation.ll | 39 +- test/CodeGen/AMDGPU/bswap.ll | 14 +- 
test/CodeGen/AMDGPU/build_vector.ll | 4 +- test/CodeGen/AMDGPU/call.ll | 6 +- test/CodeGen/AMDGPU/calling-conventions.ll | 43 +- test/CodeGen/AMDGPU/captured-frame-index.ll | 78 +- test/CodeGen/AMDGPU/cf-loop-on-constant.ll | 12 +- test/CodeGen/AMDGPU/cf-stack-bug.ll | 8 +- test/CodeGen/AMDGPU/cf_end.ll | 2 +- test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll | 18 +- test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 222 +- test/CodeGen/AMDGPU/cgp-bitfield-extract.ll | 19 +- test/CodeGen/AMDGPU/clamp-modifier.ll | 222 + test/CodeGen/AMDGPU/clamp-omod-special-case.mir | 424 + test/CodeGen/AMDGPU/clamp.ll | 529 ++ test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll | 4 +- test/CodeGen/AMDGPU/coalescer-subrange-crash.ll | 32 +- test/CodeGen/AMDGPU/coalescer_remat.ll | 2 +- .../AMDGPU/code-object-metadata-deduce-ro-arg.ll | 33 + .../code-object-metadata-from-llvm-ir-full.ll | 1260 +++ .../code-object-metadata-invalid-ocl-version-1.ll | 9 + .../code-object-metadata-invalid-ocl-version-2.ll | 10 + .../code-object-metadata-invalid-ocl-version-3.ll | 10 + .../code-object-metadata-kernel-code-props.ll | 32 + .../code-object-metadata-kernel-debug-props.ll | 67 + .../AMDGPU/codegen-prepare-addrmode-sext.ll | 2 +- test/CodeGen/AMDGPU/combine_vloads.ll | 2 +- test/CodeGen/AMDGPU/commute-compares.ll | 106 +- test/CodeGen/AMDGPU/commute-shifts.ll | 17 +- test/CodeGen/AMDGPU/commute_modifiers.ll | 20 +- test/CodeGen/AMDGPU/concat_vectors.ll | 64 +- test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir | 16 +- test/CodeGen/AMDGPU/constant-fold-mi-operands.ll | 20 +- test/CodeGen/AMDGPU/control-flow-fastregalloc.ll | 37 +- test/CodeGen/AMDGPU/convergent-inlineasm.ll | 5 +- test/CodeGen/AMDGPU/copy-illegal-type.ll | 22 +- test/CodeGen/AMDGPU/copy-to-reg.ll | 2 +- test/CodeGen/AMDGPU/ctlz.ll | 35 +- test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 34 +- test/CodeGen/AMDGPU/ctpop.ll | 30 +- test/CodeGen/AMDGPU/ctpop64.ll | 22 +- test/CodeGen/AMDGPU/cttz_zero_undef.ll | 8 +- test/CodeGen/AMDGPU/cube.ll | 18 +- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 32 +- test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll | 12 +- test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll | 10 +- test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll | 2 +- .../dagcombiner-bug-illegal-vec4-int-to-fp.ll | 4 +- test/CodeGen/AMDGPU/debug.ll | 2 +- test/CodeGen/AMDGPU/debugger-emit-prologue.ll | 2 +- test/CodeGen/AMDGPU/debugger-insert-nops.ll | 26 +- test/CodeGen/AMDGPU/debugger-reserve-regs.ll | 3 +- test/CodeGen/AMDGPU/default-fp-mode.ll | 52 +- test/CodeGen/AMDGPU/detect-dead-lanes.mir | 18 +- .../AMDGPU/disconnected-predset-break-bug.ll | 2 +- test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll | 2 +- test/CodeGen/AMDGPU/ds-combine-large-stride.ll | 412 + .../ds-negative-offset-addressing-mode-loop.ll | 2 +- test/CodeGen/AMDGPU/ds-sub-offset.ll | 14 +- test/CodeGen/AMDGPU/ds_read2.ll | 48 +- test/CodeGen/AMDGPU/ds_read2_offset_order.ll | 2 +- test/CodeGen/AMDGPU/ds_read2_superreg.ll | 24 +- test/CodeGen/AMDGPU/ds_read2st64.ll | 30 +- test/CodeGen/AMDGPU/ds_write2.ll | 42 +- test/CodeGen/AMDGPU/ds_write2st64.ll | 10 +- test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 2 +- test/CodeGen/AMDGPU/early-if-convert-cost.ll | 110 + test/CodeGen/AMDGPU/early-if-convert.ll | 454 + test/CodeGen/AMDGPU/early-inline-alias.ll | 12 + test/CodeGen/AMDGPU/early-inline.ll | 25 + test/CodeGen/AMDGPU/elf.ll | 8 +- test/CodeGen/AMDGPU/elf.r600.ll | 2 +- test/CodeGen/AMDGPU/else.ll | 18 +- test/CodeGen/AMDGPU/empty-function.ll | 4 +- .../AMDGPU/enable-no-signed-zeros-fp-math.ll | 22 + 
test/CodeGen/AMDGPU/endcf-loop-header.ll | 2 +- test/CodeGen/AMDGPU/env-amdgiz.ll | 11 + test/CodeGen/AMDGPU/env-amdgizcl.ll | 11 + test/CodeGen/AMDGPU/exceed-max-sgprs.ll | 10 +- test/CodeGen/AMDGPU/extend-bit-ops-i16.ll | 6 +- test/CodeGen/AMDGPU/extload-align.ll | 2 +- test/CodeGen/AMDGPU/extload-private.ll | 16 +- test/CodeGen/AMDGPU/extload.ll | 8 +- .../extract-vector-elt-build-vector-combine.ll | 6 +- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll | 128 + test/CodeGen/AMDGPU/extract_vector_elt-f64.ll | 6 +- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll | 125 +- test/CodeGen/AMDGPU/extract_vector_elt-i64.ll | 12 +- test/CodeGen/AMDGPU/extract_vector_elt-i8.ll | 20 +- test/CodeGen/AMDGPU/extractelt-to-trunc.ll | 12 +- test/CodeGen/AMDGPU/fabs.f16.ll | 123 +- test/CodeGen/AMDGPU/fabs.f64.ll | 16 +- test/CodeGen/AMDGPU/fabs.ll | 14 +- test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll | 16 +- test/CodeGen/AMDGPU/fadd.f16.ll | 88 +- test/CodeGen/AMDGPU/fadd.ll | 19 +- test/CodeGen/AMDGPU/fadd64.ll | 8 +- test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 321 +- test/CodeGen/AMDGPU/fcanonicalize.ll | 146 +- test/CodeGen/AMDGPU/fceil.ll | 12 +- test/CodeGen/AMDGPU/fceil64.ll | 12 +- test/CodeGen/AMDGPU/fcmp-cnd.ll | 2 +- test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll | 2 +- test/CodeGen/AMDGPU/fcmp.f16.ll | 64 +- test/CodeGen/AMDGPU/fcmp.ll | 4 +- test/CodeGen/AMDGPU/fcmp64.ll | 12 +- test/CodeGen/AMDGPU/fconst64.ll | 2 +- test/CodeGen/AMDGPU/fcopysign.f16.ll | 264 + test/CodeGen/AMDGPU/fcopysign.f32.ll | 6 +- test/CodeGen/AMDGPU/fcopysign.f64.ll | 8 +- test/CodeGen/AMDGPU/fdiv.f16.ll | 54 +- test/CodeGen/AMDGPU/fdiv.f64.ll | 101 +- test/CodeGen/AMDGPU/fdiv.ll | 28 +- test/CodeGen/AMDGPU/ffloor.f64.ll | 16 +- test/CodeGen/AMDGPU/ffloor.ll | 6 +- test/CodeGen/AMDGPU/fix-vgpr-copies.mir | 44 + test/CodeGen/AMDGPU/flat-address-space.ll | 56 +- .../AMDGPU/flat-for-global-subtarget-feature.ll | 4 +- test/CodeGen/AMDGPU/flat-scratch-reg.ll | 8 +- test/CodeGen/AMDGPU/flat_atomics.ll | 194 +- test/CodeGen/AMDGPU/flat_atomics_i64.ll | 194 +- test/CodeGen/AMDGPU/fma-combine.ll | 52 +- test/CodeGen/AMDGPU/fma.f64.ll | 6 +- test/CodeGen/AMDGPU/fma.ll | 10 +- test/CodeGen/AMDGPU/fmax3.f64.ll | 2 +- test/CodeGen/AMDGPU/fmax3.ll | 4 +- test/CodeGen/AMDGPU/fmax_legacy.f64.ll | 8 +- test/CodeGen/AMDGPU/fmax_legacy.ll | 14 +- test/CodeGen/AMDGPU/fmaxnum.f64.ll | 10 +- test/CodeGen/AMDGPU/fmaxnum.ll | 34 +- test/CodeGen/AMDGPU/fmed3.ll | 851 +- test/CodeGen/AMDGPU/fmin3.ll | 4 +- test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll | 47 + test/CodeGen/AMDGPU/fmin_legacy.f64.ll | 10 +- test/CodeGen/AMDGPU/fmin_legacy.ll | 20 +- test/CodeGen/AMDGPU/fminnum.f64.ll | 10 +- test/CodeGen/AMDGPU/fminnum.ll | 34 +- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll | 55 +- test/CodeGen/AMDGPU/fmul.f16.ll | 76 +- test/CodeGen/AMDGPU/fmul.ll | 16 +- test/CodeGen/AMDGPU/fmul64.ll | 6 +- test/CodeGen/AMDGPU/fmuladd.f16.ll | 50 +- test/CodeGen/AMDGPU/fmuladd.f32.ll | 36 +- test/CodeGen/AMDGPU/fmuladd.f64.ll | 16 +- test/CodeGen/AMDGPU/fmuladd.v2f16.ll | 107 + test/CodeGen/AMDGPU/fnearbyint.ll | 12 +- test/CodeGen/AMDGPU/fneg-combines.ll | 951 +- test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 156 +- test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 16 +- test/CodeGen/AMDGPU/fneg-fabs.ll | 16 +- test/CodeGen/AMDGPU/fneg.f16.ll | 120 +- test/CodeGen/AMDGPU/fneg.f64.ll | 10 +- test/CodeGen/AMDGPU/fneg.ll | 12 +- test/CodeGen/AMDGPU/fold-cndmask.mir | 34 + test/CodeGen/AMDGPU/fold-immediate-output-mods.mir | 306 + test/CodeGen/AMDGPU/fp-classify.ll | 18 +- 
test/CodeGen/AMDGPU/fp16_to_fp32.ll | 2 +- test/CodeGen/AMDGPU/fp16_to_fp64.ll | 2 +- test/CodeGen/AMDGPU/fp32_to_fp16.ll | 2 +- test/CodeGen/AMDGPU/fp_to_sint.f64.ll | 12 +- test/CodeGen/AMDGPU/fp_to_sint.ll | 20 +- test/CodeGen/AMDGPU/fp_to_uint.f64.ll | 16 +- test/CodeGen/AMDGPU/fp_to_uint.ll | 18 +- test/CodeGen/AMDGPU/fpext.f16.ll | 243 +- test/CodeGen/AMDGPU/fpext.ll | 10 +- test/CodeGen/AMDGPU/fptosi.f16.ll | 43 +- test/CodeGen/AMDGPU/fptoui.f16.ll | 37 +- test/CodeGen/AMDGPU/fptrunc.f16.ll | 163 +- test/CodeGen/AMDGPU/fptrunc.ll | 10 +- test/CodeGen/AMDGPU/fract.f64.ll | 8 +- test/CodeGen/AMDGPU/fract.ll | 8 +- test/CodeGen/AMDGPU/frem.ll | 25 +- test/CodeGen/AMDGPU/fsqrt.f64.ll | 4 +- test/CodeGen/AMDGPU/fsqrt.ll | 20 +- test/CodeGen/AMDGPU/fsub.f16.ll | 141 +- test/CodeGen/AMDGPU/fsub.ll | 81 +- test/CodeGen/AMDGPU/fsub64.ll | 20 +- test/CodeGen/AMDGPU/ftrunc.f64.ll | 14 +- test/CodeGen/AMDGPU/ftrunc.ll | 12 +- test/CodeGen/AMDGPU/gep-address-space.ll | 8 +- test/CodeGen/AMDGPU/global-constant.ll | 4 +- test/CodeGen/AMDGPU/global-directive.ll | 2 +- test/CodeGen/AMDGPU/global-extload-i16.ll | 64 +- test/CodeGen/AMDGPU/global-variable-relocs.ll | 22 +- test/CodeGen/AMDGPU/global_atomics.ll | 196 +- test/CodeGen/AMDGPU/global_atomics_i64.ll | 194 +- test/CodeGen/AMDGPU/gv-const-addrspace.ll | 10 +- test/CodeGen/AMDGPU/gv-offset-folding.ll | 4 +- test/CodeGen/AMDGPU/half.ll | 221 +- test/CodeGen/AMDGPU/hsa-default-device.ll | 2 +- test/CodeGen/AMDGPU/hsa-fp-mode.ll | 31 +- test/CodeGen/AMDGPU/hsa-func.ll | 2 +- test/CodeGen/AMDGPU/hsa-globals.ll | 2 +- test/CodeGen/AMDGPU/hsa-group-segment.ll | 2 +- test/CodeGen/AMDGPU/hsa-note-no-func.ll | 4 + test/CodeGen/AMDGPU/hsa.ll | 2 + test/CodeGen/AMDGPU/i1-copy-implicit-def.ll | 2 +- test/CodeGen/AMDGPU/i1-copy-phi.ll | 2 +- test/CodeGen/AMDGPU/i8-to-double-to-float.ll | 2 +- .../AMDGPU/icmp-select-sete-reverse-args.ll | 2 +- test/CodeGen/AMDGPU/icmp.i16.ll | 40 +- test/CodeGen/AMDGPU/icmp64.ll | 20 +- test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 45 + test/CodeGen/AMDGPU/image-attributes.ll | 20 +- test/CodeGen/AMDGPU/image-resource-id.ll | 40 +- test/CodeGen/AMDGPU/imm.ll | 151 +- test/CodeGen/AMDGPU/imm16.ll | 66 +- test/CodeGen/AMDGPU/immv216.ll | 446 + .../CodeGen/AMDGPU/indirect-addressing-si-noopt.ll | 2 +- test/CodeGen/AMDGPU/indirect-addressing-si.ll | 49 +- test/CodeGen/AMDGPU/indirect-private-64.ll | 22 +- test/CodeGen/AMDGPU/infinite-loop-evergreen.ll | 2 +- test/CodeGen/AMDGPU/infinite-loop.ll | 2 +- test/CodeGen/AMDGPU/inline-asm.ll | 81 +- test/CodeGen/AMDGPU/inline-calls.ll | 8 +- test/CodeGen/AMDGPU/inline-constraints.ll | 12 +- test/CodeGen/AMDGPU/inlineasm-16.ll | 8 +- test/CodeGen/AMDGPU/inlineasm-illegal-type.ll | 20 +- test/CodeGen/AMDGPU/inlineasm-packed.ll | 57 + test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir | 40 + test/CodeGen/AMDGPU/insert-waits-callee.mir | 25 + test/CodeGen/AMDGPU/insert-waits-exp.mir | 12 +- test/CodeGen/AMDGPU/insert_subreg.ll | 2 +- test/CodeGen/AMDGPU/insert_vector_elt.ll | 156 +- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 470 + test/CodeGen/AMDGPU/inserted-wait-states.mir | 226 +- test/CodeGen/AMDGPU/internalize.ll | 35 + test/CodeGen/AMDGPU/invalid-addrspacecast.ll | 2 +- .../AMDGPU/invalid-opencl-version-metadata1.ll | 6 - .../AMDGPU/invalid-opencl-version-metadata2.ll | 7 - .../AMDGPU/invalid-opencl-version-metadata3.ll | 7 - .../AMDGPU/invariant-load-no-alias-store.ll | 4 +- test/CodeGen/AMDGPU/invert-br-undef-vcc.mir | 2 +- test/CodeGen/AMDGPU/kcache-fold.ll | 186 +- 
test/CodeGen/AMDGPU/kernarg-stack-alignment.ll | 20 +- test/CodeGen/AMDGPU/kernel-args.ll | 72 +- test/CodeGen/AMDGPU/large-alloca-compute.ll | 4 +- test/CodeGen/AMDGPU/large-alloca-graphics.ll | 3 + test/CodeGen/AMDGPU/large-constant-initializer.ll | 2 +- .../AMDGPU/large-work-group-promote-alloca.ll | 72 +- test/CodeGen/AMDGPU/lds-alignment.ll | 28 +- test/CodeGen/AMDGPU/lds-initializer.ll | 2 +- test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll | 2 +- test/CodeGen/AMDGPU/lds-oqap-crash.ll | 2 +- test/CodeGen/AMDGPU/lds-output-queue.ll | 4 +- test/CodeGen/AMDGPU/lds-size.ll | 2 +- test/CodeGen/AMDGPU/lds-zero-initializer.ll | 2 +- .../CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll | 2 +- test/CodeGen/AMDGPU/limit-coalesce.mir | 71 + test/CodeGen/AMDGPU/literals.ll | 8 +- test/CodeGen/AMDGPU/liveness.mir | 2 +- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll | 437 - test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll | 631 -- test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll | 56 - test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll | 57 - test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll | 29 +- test/CodeGen/AMDGPU/llvm.SI.export.ll | 237 - test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll | 59 - test/CodeGen/AMDGPU/llvm.SI.gather4.ll | 525 -- test/CodeGen/AMDGPU/llvm.SI.getlod.ll | 44 - test/CodeGen/AMDGPU/llvm.SI.image.ll | 49 - test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll | 94 - test/CodeGen/AMDGPU/llvm.SI.image.sample.ll | 309 - test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll | 309 - test/CodeGen/AMDGPU/llvm.SI.load.dword.ll | 7 +- test/CodeGen/AMDGPU/llvm.SI.packf16.ll | 28 - test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll | 188 +- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll | 177 +- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll | 2 +- .../AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll | 2 +- .../AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll | 20 +- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll | 60 +- test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 166 + test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll | 14 +- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll | 20 +- test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll | 40 +- test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll | 9 +- test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll | 162 + test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll | 484 + test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll | 62 +- test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll | 39 + test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll | 28 + test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll | 10 +- test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll | 9 +- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll | 12 +- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll | 12 +- test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll | 51 +- 
test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll | 74 +- test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll | 16 +- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll | 172 +- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll | 252 +- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll | 259 +- test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll | 91 +- .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll | 24 +- test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll | 10 +- test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll | 26 +- test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll | 8 +- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll | 42 +- test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 8 +- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll | 8 +- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll | 14 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 11 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll | 556 ++ test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll | 127 + test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll | 29 +- test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 623 ++ test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll | 6 +- test/CodeGen/AMDGPU/llvm.ceil.f16.ll | 24 +- test/CodeGen/AMDGPU/llvm.cos.f16.ll | 46 +- test/CodeGen/AMDGPU/llvm.cos.ll | 4 +- test/CodeGen/AMDGPU/llvm.dbg.value.ll | 2 +- test/CodeGen/AMDGPU/llvm.exp2.f16.ll | 20 +- test/CodeGen/AMDGPU/llvm.exp2.ll | 8 +- test/CodeGen/AMDGPU/llvm.floor.f16.ll | 20 +- test/CodeGen/AMDGPU/llvm.fma.f16.ll | 177 +- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 63 +- test/CodeGen/AMDGPU/llvm.log2.f16.ll | 32 +- test/CodeGen/AMDGPU/llvm.log2.ll | 6 +- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 90 +- test/CodeGen/AMDGPU/llvm.memcpy.ll | 22 +- test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 98 +- test/CodeGen/AMDGPU/llvm.r600.cube.ll | 57 + test/CodeGen/AMDGPU/llvm.r600.dot4.ll | 2 +- test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll | 2 +- test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll | 20 +- 
test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll | 2 +- test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll | 6 +- test/CodeGen/AMDGPU/llvm.r600.tex.ll | 2 +- test/CodeGen/AMDGPU/llvm.rint.f16.ll | 33 +- test/CodeGen/AMDGPU/llvm.rint.f64.ll | 6 +- test/CodeGen/AMDGPU/llvm.rint.ll | 6 +- test/CodeGen/AMDGPU/llvm.round.f64.ll | 10 +- test/CodeGen/AMDGPU/llvm.round.ll | 81 +- test/CodeGen/AMDGPU/llvm.sin.f16.ll | 46 +- test/CodeGen/AMDGPU/llvm.sin.ll | 16 +- test/CodeGen/AMDGPU/llvm.sqrt.f16.ll | 20 +- test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 20 +- test/CodeGen/AMDGPU/load-constant-f64.ll | 2 +- test/CodeGen/AMDGPU/load-constant-i1.ll | 88 +- test/CodeGen/AMDGPU/load-constant-i16.ll | 80 +- test/CodeGen/AMDGPU/load-constant-i32.ll | 40 +- test/CodeGen/AMDGPU/load-constant-i64.ll | 12 +- test/CodeGen/AMDGPU/load-constant-i8.ll | 112 +- test/CodeGen/AMDGPU/load-global-f32.ll | 12 +- test/CodeGen/AMDGPU/load-global-f64.ll | 12 +- test/CodeGen/AMDGPU/load-global-i1.ll | 88 +- test/CodeGen/AMDGPU/load-global-i16.ll | 80 +- test/CodeGen/AMDGPU/load-global-i32.ll | 40 +- test/CodeGen/AMDGPU/load-global-i64.ll | 12 +- test/CodeGen/AMDGPU/load-global-i8.ll | 112 +- test/CodeGen/AMDGPU/load-input-fold.ll | 9 - test/CodeGen/AMDGPU/load-local-f32.ll | 12 +- test/CodeGen/AMDGPU/load-local-f64.ll | 12 +- test/CodeGen/AMDGPU/load-local-i1.ll | 88 +- test/CodeGen/AMDGPU/load-local-i16.ll | 80 +- test/CodeGen/AMDGPU/load-local-i32.ll | 40 +- test/CodeGen/AMDGPU/load-local-i64.ll | 12 +- test/CodeGen/AMDGPU/load-local-i8.ll | 112 +- test/CodeGen/AMDGPU/load-weird-sizes.ll | 4 +- test/CodeGen/AMDGPU/local-64.ll | 32 +- test/CodeGen/AMDGPU/local-atomics.ll | 108 +- test/CodeGen/AMDGPU/local-atomics64.ll | 100 +- test/CodeGen/AMDGPU/local-memory.amdgcn.ll | 10 +- test/CodeGen/AMDGPU/local-memory.ll | 4 +- test/CodeGen/AMDGPU/local-memory.r600.ll | 4 +- test/CodeGen/AMDGPU/local-stack-slot-bug.ll | 7 +- test/CodeGen/AMDGPU/loop-address.ll | 2 +- test/CodeGen/AMDGPU/loop-idiom.ll | 4 +- test/CodeGen/AMDGPU/loop_break.ll | 265 +- test/CodeGen/AMDGPU/lower-mem-intrinsics.ll | 117 + .../AMDGPU/lower-range-metadata-intrinsic-call.ll | 14 +- test/CodeGen/AMDGPU/lshl.ll | 15 - test/CodeGen/AMDGPU/lshr.ll | 15 - test/CodeGen/AMDGPU/lshr.v2i16.ll | 149 + test/CodeGen/AMDGPU/mad-combine.ll | 28 +- test/CodeGen/AMDGPU/mad24-get-global-id.ll | 2 +- test/CodeGen/AMDGPU/mad_int24.ll | 2 +- test/CodeGen/AMDGPU/mad_uint24.ll | 8 +- test/CodeGen/AMDGPU/madak.ll | 20 +- test/CodeGen/AMDGPU/madmk.ll | 20 +- test/CodeGen/AMDGPU/max.i16.ll | 90 +- test/CodeGen/AMDGPU/max.ll | 46 +- test/CodeGen/AMDGPU/max3.ll | 4 +- test/CodeGen/AMDGPU/mem-builtins.ll | 12 +- test/CodeGen/AMDGPU/merge-stores.ll | 100 +- test/CodeGen/AMDGPU/min.ll | 303 +- test/CodeGen/AMDGPU/min3.ll | 8 +- test/CodeGen/AMDGPU/missing-store.ll | 2 +- .../AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll | 2 +- test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll | 4 +- test/CodeGen/AMDGPU/mubuf.ll | 26 +- test/CodeGen/AMDGPU/mul.ll | 62 +- test/CodeGen/AMDGPU/mul_int24.ll | 14 +- test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 26 +- test/CodeGen/AMDGPU/mul_uint24-r600.ll | 12 +- test/CodeGen/AMDGPU/mulhu.ll | 17 - test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 710 ++ test/CodeGen/AMDGPU/multilevel-break.ll | 4 +- test/CodeGen/AMDGPU/nested-loop-conditions.ll | 269 + .../AMDGPU/no-initializer-constant-addrspace.ll | 4 +- test/CodeGen/AMDGPU/no-shrink-extloads.ll | 36 +- test/CodeGen/AMDGPU/nop-data.ll | 87 + test/CodeGen/AMDGPU/nullptr.ll | 113 + test/CodeGen/AMDGPU/omod.ll | 297 + 
test/CodeGen/AMDGPU/opencl-image-metadata.ll | 2 +- test/CodeGen/AMDGPU/operand-folding.ll | 14 +- test/CodeGen/AMDGPU/operand-spacing.ll | 2 +- test/CodeGen/AMDGPU/optimize-if-exec-masking.mir | 20 +- test/CodeGen/AMDGPU/or.ll | 44 +- test/CodeGen/AMDGPU/over-max-lds-size.ll | 2 +- test/CodeGen/AMDGPU/pack.v2f16.ll | 219 + test/CodeGen/AMDGPU/pack.v2i16.ll | 181 + test/CodeGen/AMDGPU/packetizer.ll | 2 +- test/CodeGen/AMDGPU/parallelandifcollapse.ll | 2 +- test/CodeGen/AMDGPU/parallelorifcollapse.ll | 2 +- test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll | 638 ++ .../partially-dead-super-register-immediate.ll | 2 +- test/CodeGen/AMDGPU/predicates.ll | 8 +- test/CodeGen/AMDGPU/private-access-no-objects.ll | 8 +- test/CodeGen/AMDGPU/private-element-size.ll | 102 +- test/CodeGen/AMDGPU/private-memory-atomics.ll | 4 +- test/CodeGen/AMDGPU/private-memory-broken.ll | 2 +- test/CodeGen/AMDGPU/private-memory-r600.ll | 37 +- .../AMDGPU/promote-alloca-array-allocation.ll | 4 +- .../AMDGPU/promote-alloca-bitcast-function.ll | 4 +- test/CodeGen/AMDGPU/promote-alloca-globals.ll | 4 +- .../AMDGPU/promote-alloca-invariant-markers.ll | 2 +- test/CodeGen/AMDGPU/promote-alloca-lifetime.ll | 10 +- .../AMDGPU/promote-alloca-mem-intrinsics.ll | 14 +- test/CodeGen/AMDGPU/promote-alloca-no-opts.ll | 4 +- .../AMDGPU/promote-alloca-padding-size-estimate.ll | 6 +- .../AMDGPU/promote-alloca-stored-pointer-value.ll | 10 +- test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll | 8 +- test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll | 14 +- .../CodeGen/AMDGPU/promote-alloca-to-lds-select.ll | 16 +- .../AMDGPU/promote-alloca-unhandled-intrinsic.ll | 2 +- test/CodeGen/AMDGPU/promote-alloca-volatile.ll | 10 +- test/CodeGen/AMDGPU/pv.ll | 458 +- ...-infinite-loop-bug-while-reorganizing-vector.ll | 4 +- test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll | 2 +- test/CodeGen/AMDGPU/r600.alu-limits.ll | 29 + test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll | 7 + test/CodeGen/AMDGPU/r600.bitcast.ll | 16 +- test/CodeGen/AMDGPU/r600.global_atomics.ll | 542 ++ test/CodeGen/AMDGPU/r600.private-memory.ll | 2 +- test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll | 16 +- test/CodeGen/AMDGPU/rcp-pattern.ll | 47 +- .../AMDGPU/read-register-invalid-subtarget.ll | 2 +- .../AMDGPU/read-register-invalid-type-i32.ll | 2 +- .../AMDGPU/read-register-invalid-type-i64.ll | 2 +- test/CodeGen/AMDGPU/read_register.ll | 14 +- test/CodeGen/AMDGPU/readcyclecounter.ll | 2 +- test/CodeGen/AMDGPU/reduce-load-width-alignment.ll | 6 +- .../CodeGen/AMDGPU/reduce-store-width-alignment.ll | 10 +- test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll | 2 +- test/CodeGen/AMDGPU/regcoalesce-dbg.mir | 76 + test/CodeGen/AMDGPU/register-count-comments.ll | 4 +- test/CodeGen/AMDGPU/rename-disconnected-bug.ll | 2 +- test/CodeGen/AMDGPU/rename-independent-subregs.mir | 4 +- test/CodeGen/AMDGPU/reorder-stores.ll | 8 +- test/CodeGen/AMDGPU/ret.ll | 207 +- test/CodeGen/AMDGPU/ret_jump.ll | 108 +- test/CodeGen/AMDGPU/rotl.i64.ll | 4 +- test/CodeGen/AMDGPU/rotl.ll | 6 +- test/CodeGen/AMDGPU/rotr.i64.ll | 8 +- test/CodeGen/AMDGPU/rotr.ll | 6 +- test/CodeGen/AMDGPU/rsq.ll | 16 +- test/CodeGen/AMDGPU/runtime-metadata.ll | 396 - test/CodeGen/AMDGPU/s_addk_i32.ll | 29 +- test/CodeGen/AMDGPU/s_movk_i32.ll | 26 +- test/CodeGen/AMDGPU/s_mulk_i32.ll | 10 +- test/CodeGen/AMDGPU/sad.ll | 34 +- test/CodeGen/AMDGPU/saddo.ll | 10 +- test/CodeGen/AMDGPU/salu-to-valu.ll | 40 +- test/CodeGen/AMDGPU/sampler-resource-id.ll | 6 +- test/CodeGen/AMDGPU/scalar-store-cache-flush.mir | 14 +- 
test/CodeGen/AMDGPU/scalar_to_vector.ll | 51 +- test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll | 130 +- test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll | 137 +- test/CodeGen/AMDGPU/schedule-fs-loop.ll | 121 +- test/CodeGen/AMDGPU/schedule-global-loads.ll | 4 +- test/CodeGen/AMDGPU/schedule-if-2.ll | 2 +- test/CodeGen/AMDGPU/schedule-if.ll | 2 +- test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll | 4 +- test/CodeGen/AMDGPU/schedule-regpressure-limit.ll | 591 ++ test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll | 288 + test/CodeGen/AMDGPU/schedule-regpressure.mir | 57 + test/CodeGen/AMDGPU/scratch-buffer.ll | 16 +- test/CodeGen/AMDGPU/sdiv.ll | 30 +- test/CodeGen/AMDGPU/sdivrem24.ll | 34 +- test/CodeGen/AMDGPU/sdivrem64.ll | 12 +- test/CodeGen/AMDGPU/sdwa-peephole.ll | 395 + .../AMDGPU/select-fabs-fneg-extract-legacy.ll | 4 +- test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll | 92 +- test/CodeGen/AMDGPU/select-i1.ll | 4 +- test/CodeGen/AMDGPU/select-opt.ll | 18 +- test/CodeGen/AMDGPU/select-vectors.ll | 26 +- test/CodeGen/AMDGPU/select.f16.ll | 118 +- test/CodeGen/AMDGPU/select.ll | 2 +- test/CodeGen/AMDGPU/select64.ll | 10 +- test/CodeGen/AMDGPU/selectcc-cnd.ll | 2 +- test/CodeGen/AMDGPU/selectcc-cnde-int.ll | 2 +- test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll | 2 +- test/CodeGen/AMDGPU/selectcc-opt.ll | 8 +- test/CodeGen/AMDGPU/selectcc.ll | 2 +- test/CodeGen/AMDGPU/selected-stack-object.ll | 2 +- test/CodeGen/AMDGPU/set-dx10.ll | 24 +- test/CodeGen/AMDGPU/setcc-equivalent.ll | 4 +- test/CodeGen/AMDGPU/setcc-fneg-constant.ll | 258 + test/CodeGen/AMDGPU/setcc-opt.ll | 40 +- test/CodeGen/AMDGPU/setcc.ll | 60 +- test/CodeGen/AMDGPU/setcc64.ll | 48 +- test/CodeGen/AMDGPU/seto.ll | 7 +- test/CodeGen/AMDGPU/setuo.ll | 7 +- test/CodeGen/AMDGPU/sext-eliminate.ll | 4 +- test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll | 2 +- test/CodeGen/AMDGPU/sext-in-reg.ll | 353 +- test/CodeGen/AMDGPU/sgpr-control-flow.ll | 8 +- test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll | 2 +- test/CodeGen/AMDGPU/sgpr-copy.ll | 205 +- test/CodeGen/AMDGPU/sgprcopies.ll | 58 + test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll | 16 +- test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll | 60 +- test/CodeGen/AMDGPU/shift-i64-opts.ll | 40 +- test/CodeGen/AMDGPU/shl.ll | 82 +- test/CodeGen/AMDGPU/shl.v2i16.ll | 152 + test/CodeGen/AMDGPU/shl_add_constant.ll | 10 +- test/CodeGen/AMDGPU/shl_add_ptr.ll | 36 +- test/CodeGen/AMDGPU/shrink-add-sub-constant.ll | 186 + test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir | 12 +- test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll | 47 +- test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll | 40 + test/CodeGen/AMDGPU/si-annotate-cf.ll | 8 +- test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll | 2 +- test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir | 2 +- .../si-instr-info-correct-implicit-operands.ll | 2 +- test/CodeGen/AMDGPU/si-literal-folding.ll | 14 - test/CodeGen/AMDGPU/si-lod-bias.ll | 49 +- .../si-lower-control-flow-unreachable-block.ll | 58 +- test/CodeGen/AMDGPU/si-scheduler.ll | 49 +- test/CodeGen/AMDGPU/si-sgpr-spill.ll | 843 +- test/CodeGen/AMDGPU/si-spill-cf.ll | 732 +- test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll | 8 +- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll | 39 +- test/CodeGen/AMDGPU/si-vector-hang.ll | 2 +- test/CodeGen/AMDGPU/sign_extend.ll | 26 +- test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 10 +- test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 16 +- test/CodeGen/AMDGPU/sint_to_fp.ll | 16 +- test/CodeGen/AMDGPU/sitofp.f16.ll | 60 +- test/CodeGen/AMDGPU/skip-if-dead.ll | 9 +- test/CodeGen/AMDGPU/smed3.ll | 108 
+- test/CodeGen/AMDGPU/sminmax.ll | 22 +- test/CodeGen/AMDGPU/sminmax.v2i16.ll | 224 + test/CodeGen/AMDGPU/smrd-vccz-bug.ll | 4 +- test/CodeGen/AMDGPU/smrd.ll | 114 +- test/CodeGen/AMDGPU/sopk-compares.ll | 76 +- test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll | 2 +- test/CodeGen/AMDGPU/spill-cfg-position.ll | 78 + test/CodeGen/AMDGPU/spill-m0.ll | 55 +- test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 2 +- test/CodeGen/AMDGPU/spill-wide-sgpr.ll | 16 +- test/CodeGen/AMDGPU/split-scalar-i64-add.ll | 10 +- test/CodeGen/AMDGPU/split-smrd.ll | 29 +- .../AMDGPU/split-vector-memoperand-offsets.ll | 4 +- test/CodeGen/AMDGPU/splitkit.mir | 105 + test/CodeGen/AMDGPU/sra.ll | 38 +- test/CodeGen/AMDGPU/srem.ll | 26 +- test/CodeGen/AMDGPU/srl.ll | 16 +- test/CodeGen/AMDGPU/ssubo.ll | 10 +- test/CodeGen/AMDGPU/store-barrier.ll | 2 +- test/CodeGen/AMDGPU/store-global.ll | 46 +- test/CodeGen/AMDGPU/store-local.ll | 24 +- test/CodeGen/AMDGPU/store-private.ll | 48 +- test/CodeGen/AMDGPU/store-v3i64.ll | 16 +- test/CodeGen/AMDGPU/store-vector-ptrs.ll | 2 +- test/CodeGen/AMDGPU/store_typed.ll | 4 +- test/CodeGen/AMDGPU/structurize.ll | 2 +- test/CodeGen/AMDGPU/structurize1.ll | 2 +- test/CodeGen/AMDGPU/sub.i16.ll | 22 +- test/CodeGen/AMDGPU/sub.ll | 20 +- test/CodeGen/AMDGPU/sub.v2i16.ll | 278 + test/CodeGen/AMDGPU/subreg-coalescer-crash.ll | 50 +- test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll | 2 +- test/CodeGen/AMDGPU/subreg-eliminate-dead.ll | 2 +- test/CodeGen/AMDGPU/subreg-intervals.mir | 4 +- test/CodeGen/AMDGPU/subreg_interference.mir | 24 + test/CodeGen/AMDGPU/target-cpu.ll | 12 +- test/CodeGen/AMDGPU/trap.ll | 78 +- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll | 16 +- test/CodeGen/AMDGPU/trunc-cmp-constant.ll | 26 +- test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll | 12 +- test/CodeGen/AMDGPU/trunc-store-i1.ll | 8 +- test/CodeGen/AMDGPU/trunc-store.ll | 4 +- .../AMDGPU/trunc-vector-store-assertion-failure.ll | 2 +- test/CodeGen/AMDGPU/trunc.ll | 30 +- test/CodeGen/AMDGPU/tti-unroll-prefs.ll | 2 +- test/CodeGen/AMDGPU/uaddo.ll | 119 +- test/CodeGen/AMDGPU/udiv.ll | 72 +- test/CodeGen/AMDGPU/udivrem.ll | 6 +- test/CodeGen/AMDGPU/udivrem24.ll | 34 +- test/CodeGen/AMDGPU/udivrem64.ll | 12 +- test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 18 +- test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 16 +- test/CodeGen/AMDGPU/uint_to_fp.ll | 18 +- test/CodeGen/AMDGPU/uitofp.f16.ll | 56 +- test/CodeGen/AMDGPU/umed3.ll | 113 +- test/CodeGen/AMDGPU/unaligned-load-store.ll | 62 +- test/CodeGen/AMDGPU/undefined-subreg-liverange.ll | 12 +- .../AMDGPU/unhandled-loop-condition-assertion.ll | 6 +- .../AMDGPU/uniform-branch-intrinsic-cond.ll | 1 + test/CodeGen/AMDGPU/uniform-cfg.ll | 68 +- test/CodeGen/AMDGPU/uniform-crash.ll | 4 +- .../AMDGPU/uniform-loop-inside-nonuniform.ll | 6 +- test/CodeGen/AMDGPU/unify-metadata.ll | 4 - test/CodeGen/AMDGPU/unigine-liveness-crash.ll | 55 +- test/CodeGen/AMDGPU/unknown-processor.ll | 2 +- test/CodeGen/AMDGPU/unroll.ll | 68 +- test/CodeGen/AMDGPU/unsupported-cc.ll | 20 +- test/CodeGen/AMDGPU/urecip.ll | 13 - test/CodeGen/AMDGPU/urem.ll | 14 +- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll | 36 +- test/CodeGen/AMDGPU/usubo.ll | 114 +- test/CodeGen/AMDGPU/v1i64-kernel-arg.ll | 4 +- test/CodeGen/AMDGPU/v_cndmask.ll | 52 +- test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll | 12 +- test/CodeGen/AMDGPU/v_mac.ll | 49 +- test/CodeGen/AMDGPU/v_mac_f16.ll | 317 +- test/CodeGen/AMDGPU/v_madak_f16.ll | 8 +- test/CodeGen/AMDGPU/valu-i1.ll | 97 +- .../CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir | 4 +- 
test/CodeGen/AMDGPU/vector-alloca.ll | 10 +- test/CodeGen/AMDGPU/vector-extract-insert.ll | 8 +- test/CodeGen/AMDGPU/vectorize-global-local.ll | 80 + test/CodeGen/AMDGPU/vertex-fetch-encoding.ll | 8 +- .../vgpr-spill-emergency-stack-slot-compute.ll | 10 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 105 +- test/CodeGen/AMDGPU/vi-removed-intrinsics.ll | 6 +- test/CodeGen/AMDGPU/vop-shrink.ll | 4 +- test/CodeGen/AMDGPU/vselect.ll | 8 +- test/CodeGen/AMDGPU/vselect64.ll | 2 +- test/CodeGen/AMDGPU/vtx-fetch-branch.ll | 2 +- test/CodeGen/AMDGPU/vtx-schedule.ll | 2 +- test/CodeGen/AMDGPU/wait.ll | 72 +- test/CodeGen/AMDGPU/waitcnt-flat.ll | 2 +- test/CodeGen/AMDGPU/waitcnt.mir | 75 +- test/CodeGen/AMDGPU/wqm.ll | 120 +- .../AMDGPU/write-register-vgpr-into-sgpr.ll | 2 +- test/CodeGen/AMDGPU/write_register.ll | 14 +- test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll | 2 +- test/CodeGen/AMDGPU/xfail.r600.bitcast.ll | 6 +- test/CodeGen/AMDGPU/xor.ll | 38 +- test/CodeGen/AMDGPU/zero_extend.ll | 10 +- test/CodeGen/AMDGPU/zext-i64-bit-operand.ll | 4 +- test/CodeGen/AMDGPU/zext-lid.ll | 83 + test/CodeGen/ARM/2007-05-22-tailmerge-3.ll | 8 +- test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll | 1 - test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll | 5 +- .../ARM/GlobalISel/arm-instruction-select.mir | 406 +- test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll | 567 +- test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll | 51 + test/CodeGen/ARM/GlobalISel/arm-isel.ll | 144 +- test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir | 282 + test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 233 +- test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir | 280 + test/CodeGen/ARM/alloc-no-stack-realign.ll | 100 +- test/CodeGen/ARM/arg-copy-elide.ll | 61 + test/CodeGen/ARM/arm-and-tst-peephole.ll | 58 +- test/CodeGen/ARM/arm-position-independence.ll | 144 +- test/CodeGen/ARM/atomic-cmpxchg.ll | 14 +- test/CodeGen/ARM/atomic-op.ll | 4 +- test/CodeGen/ARM/atomic-ops-v8.ll | 35 +- test/CodeGen/ARM/bfi.ll | 12 +- test/CodeGen/ARM/bic.ll | 13 +- test/CodeGen/ARM/bool-ext-inc.ll | 32 + test/CodeGen/ARM/build-attributes.ll | 77 + test/CodeGen/ARM/cmp1-peephole-thumb.mir | 78 + test/CodeGen/ARM/cmp2-peephole-thumb.mir | 108 + test/CodeGen/ARM/cmpxchg-weak.ll | 8 +- test/CodeGen/ARM/constantpool-promote.ll | 62 +- test/CodeGen/ARM/debug-info-s16-reg.ll | 2 - test/CodeGen/ARM/debug-info-sreg2.ll | 2 +- test/CodeGen/ARM/div.ll | 9 + test/CodeGen/ARM/fast-isel-align.ll | 4 +- test/CodeGen/ARM/fast-isel-cmp-imm.ll | 16 +- test/CodeGen/ARM/fold-stack-adjust.ll | 2 +- test/CodeGen/ARM/fp-only-sp.ll | 62 + test/CodeGen/ARM/fp16-promote.ll | 32 +- test/CodeGen/ARM/fp16-v3.ll | 2 +- test/CodeGen/ARM/fpcmp-opt.ll | 8 +- test/CodeGen/ARM/fpcmp.ll | 4 +- test/CodeGen/ARM/fpcmp_ueq.ll | 2 +- test/CodeGen/ARM/fpscr-intrinsics.ll | 44 + test/CodeGen/ARM/gpr-paired-spill.ll | 18 +- test/CodeGen/ARM/ifcvt10.ll | 2 - test/CodeGen/ARM/illegal-bitfield-loadstore.ll | 184 + test/CodeGen/ARM/indirectbr.ll | 1 + test/CodeGen/ARM/interval-update-remat.ll | 4 +- test/CodeGen/ARM/intrinsics-coprocessor.ll | 1 - test/CodeGen/ARM/ldm-stm-i256.ll | 38 + test/CodeGen/ARM/ldrd.ll | 28 +- test/CodeGen/ARM/load-combine-big-endian.ll | 779 ++ test/CodeGen/ARM/load-combine.ll | 692 ++ test/CodeGen/ARM/longMAC.ll | 262 +- test/CodeGen/ARM/lowerMUL-newload.ll | 115 + test/CodeGen/ARM/mature-mc-support.ll | 2 +- test/CodeGen/ARM/misched-fp-basic.ll | 69 + test/CodeGen/ARM/misched-int-basic-thumb2.mir | 175 + test/CodeGen/ARM/misched-int-basic.mir | 128 + test/CodeGen/ARM/movt.ll | 8 + 
test/CodeGen/ARM/msr-it-block.ll | 8 +- test/CodeGen/ARM/neon_vabs.ll | 95 +- test/CodeGen/ARM/no-cmov2bfi.ll | 19 + test/CodeGen/ARM/phi.ll | 1 - test/CodeGen/ARM/pr32545.ll | 22 + test/CodeGen/ARM/prera-ldst-aliasing.mir | 40 + test/CodeGen/ARM/prera-ldst-insertpt.mir | 105 + test/CodeGen/ARM/rbit.ll | 3 +- test/CodeGen/ARM/rev.ll | 14 +- test/CodeGen/ARM/select_const.ll | 326 + test/CodeGen/ARM/select_xform.ll | 12 +- test/CodeGen/ARM/setcc-logic.ll | 74 + test/CodeGen/ARM/setcc-sentinals.ll | 14 - test/CodeGen/ARM/single-issue-r52.mir | 86 + test/CodeGen/ARM/sjljeh-swifterror.ll | 27 + test/CodeGen/ARM/smml.ll | 43 +- test/CodeGen/ARM/smul.ll | 29 + test/CodeGen/ARM/softfp-fabs-fneg.ll | 3 +- test/CodeGen/ARM/special-reg-mcore.ll | 82 +- test/CodeGen/ARM/special-reg-v8m-main.ll | 8 +- test/CodeGen/ARM/stack_guard_remat.ll | 8 +- test/CodeGen/ARM/static-addr-hoisting.ll | 6 +- test/CodeGen/ARM/tail-opts.ll | 52 + test/CodeGen/ARM/thumb1-div.ll | 67 + test/CodeGen/ARM/unschedule-first-call.ll | 136 + test/CodeGen/ARM/v6-jumptable-clobber.mir | 384 + test/CodeGen/ARM/v8m-tail-call.ll | 23 + test/CodeGen/ARM/v8m.base-jumptable_alignment.ll | 51 + test/CodeGen/ARM/va_arg.ll | 8 +- test/CodeGen/ARM/vcmp-crash.ll | 11 + test/CodeGen/ARM/vldm-liveness.ll | 19 +- test/CodeGen/ARM/vldm-liveness.mir | 40 + test/CodeGen/ARM/vsel.ll | 8 +- test/CodeGen/ARM/vuzp.ll | 22 +- test/CodeGen/AVR/inline-asm/inline-asm.ll | 2 +- test/CodeGen/AVR/inline-asm/inline-asm2.ll | 2 +- test/CodeGen/AVR/inline-asm/multibyte.ll | 2 +- test/CodeGen/AVR/intrinsics/stacksave-restore.ll | 27 + test/CodeGen/AVR/no-print-operand-twice.ll | 8 + test/CodeGen/AVR/pseudo/ADCWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/ADDWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/ANDIWRdK.mir | 2 +- test/CodeGen/AVR/pseudo/ANDWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/ASRWRd.mir | 2 +- test/CodeGen/AVR/pseudo/COMWRd.mir | 2 +- test/CodeGen/AVR/pseudo/CPCWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/CPWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/EORWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/FRMIDX.mir | 2 +- test/CodeGen/AVR/pseudo/INWRdA.mir | 2 +- test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir | 5 +- test/CodeGen/AVR/pseudo/LDDWRdYQ.mir | 5 +- test/CodeGen/AVR/pseudo/LDIWRdK.mir | 2 +- test/CodeGen/AVR/pseudo/LDSWRdK.mir | 2 +- test/CodeGen/AVR/pseudo/LDWRdPtr.mir | 2 +- test/CodeGen/AVR/pseudo/LDWRdPtrPd.mir | 2 +- test/CodeGen/AVR/pseudo/LDWRdPtrPi.mir | 2 +- test/CodeGen/AVR/pseudo/LSLWRd.mir | 2 +- test/CodeGen/AVR/pseudo/LSRWRd.mir | 2 +- test/CodeGen/AVR/pseudo/ORIWRdK.mir | 2 +- test/CodeGen/AVR/pseudo/ORWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/OUTWARr.mir | 2 +- test/CodeGen/AVR/pseudo/POPWRd.mir | 2 +- test/CodeGen/AVR/pseudo/PUSHWRr.mir | 2 +- test/CodeGen/AVR/pseudo/SBCIWRdK.mir | 2 +- test/CodeGen/AVR/pseudo/SBCWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/SEXT.mir | 2 +- test/CodeGen/AVR/pseudo/STDWPtrQRr.mir | 2 +- test/CodeGen/AVR/pseudo/STSWKRr.mir | 2 +- test/CodeGen/AVR/pseudo/STWPtrPdRr.mir | 2 +- test/CodeGen/AVR/pseudo/STWPtrPiRr.mir | 2 +- test/CodeGen/AVR/pseudo/STWPtrRr.mir | 2 +- test/CodeGen/AVR/pseudo/SUBIWRdK.mir | 2 +- test/CodeGen/AVR/pseudo/SUBWRdRr.mir | 2 +- test/CodeGen/AVR/pseudo/ZEXT.mir | 2 +- .../AVR/pseudo/expand-lddw-dst-src-same.mir | 3 +- test/CodeGen/AVR/relax-mem/STDWPtrQRr.mir | 2 +- test/CodeGen/BPF/cc_args.ll | 2 +- test/CodeGen/BPF/cc_args_be.ll | 2 +- test/CodeGen/BPF/cc_ret.ll | 2 +- test/CodeGen/BPF/fi_ri.ll | 2 +- test/CodeGen/BPF/intrinsics.ll | 4 +- test/CodeGen/BPF/mem_offset.ll | 17 + 
test/CodeGen/BPF/objdump_intrinsics.ll | 4 +- test/CodeGen/BPF/objdump_trivial.ll | 11 +- test/CodeGen/BPF/sanity.ll | 2 +- test/CodeGen/BPF/undef.ll | 67 +- test/CodeGen/BPF/warn-call.ll | 69 + test/CodeGen/BPF/warn-stack.ll | 76 + test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll | 3 + .../Generic/2007-04-08-MultipleFrameIndices.ll | 3 + test/CodeGen/Generic/2007-12-17-InvokeAsm.ll | 2 + .../CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll | 4 + test/CodeGen/Generic/MachineBranchProb.ll | 6 +- test/CodeGen/Generic/externally_available.ll | 2 +- test/CodeGen/Generic/icmp-illegal.ll | 1 - test/CodeGen/Generic/inline-asm-mem-clobber.ll | 3 + ...ltiple-return-values-cross-block-with-invoke.ll | 1 - test/CodeGen/Generic/overloaded-intrinsic-name.ll | 35 +- test/CodeGen/Generic/pr24662.ll | 12 + test/CodeGen/Generic/select-cc.ll | 6 +- test/CodeGen/Generic/v-split.ll | 4 + test/CodeGen/Generic/vector-redux.ll | 3 + test/CodeGen/Generic/vector.ll | 3 + test/CodeGen/Hexagon/BranchPredict.ll | 6 +- test/CodeGen/Hexagon/adde.ll | 55 +- test/CodeGen/Hexagon/addh-sext-trunc.ll | 2 +- test/CodeGen/Hexagon/addh-shifted.ll | 2 +- test/CodeGen/Hexagon/addh.ll | 2 +- test/CodeGen/Hexagon/alu64.ll | 132 +- test/CodeGen/Hexagon/args.ll | 8 +- .../CodeGen/Hexagon/avoid-predspill-calleesaved.ll | 1 - test/CodeGen/Hexagon/bit-bitsplit-at.ll | 33 + test/CodeGen/Hexagon/bit-bitsplit-src.ll | 35 + test/CodeGen/Hexagon/bit-bitsplit.ll | 17 + test/CodeGen/Hexagon/bit-eval.ll | 2 +- test/CodeGen/Hexagon/bit-ext-sat.ll | 57 + test/CodeGen/Hexagon/bit-extract-off.ll | 23 + test/CodeGen/Hexagon/bit-extract.ll | 75 + test/CodeGen/Hexagon/bit-has.ll | 64 + test/CodeGen/Hexagon/bit-phi.ll | 1 + test/CodeGen/Hexagon/bit-rie.ll | 4 +- test/CodeGen/Hexagon/bit-skip-byval.ll | 2 +- test/CodeGen/Hexagon/bit-validate-reg.ll | 5 +- test/CodeGen/Hexagon/bitmanip.ll | 135 + test/CodeGen/Hexagon/block-addr.ll | 2 +- test/CodeGen/Hexagon/branchfolder-keep-impdef.ll | 2 +- test/CodeGen/Hexagon/brev_ld.ll | 12 +- test/CodeGen/Hexagon/brev_st.ll | 10 +- test/CodeGen/Hexagon/builtin-expect.ll | 44 + test/CodeGen/Hexagon/cext-valid-packet1.ll | 4 +- test/CodeGen/Hexagon/circ_ld.ll | 12 +- test/CodeGen/Hexagon/circ_ldw.ll | 2 +- test/CodeGen/Hexagon/circ_st.ll | 10 +- test/CodeGen/Hexagon/clr_set_toggle.ll | 30 +- test/CodeGen/Hexagon/cmp.ll | 22 +- test/CodeGen/Hexagon/combine.ll | 2 +- test/CodeGen/Hexagon/compound.ll | 4 +- test/CodeGen/Hexagon/constp-combine-neg.ll | 6 +- test/CodeGen/Hexagon/convert-to-dot-old.ll | 110 + test/CodeGen/Hexagon/ctlz-cttz-ctpop.ll | 36 - test/CodeGen/Hexagon/dead-store-stack.ll | 2 +- test/CodeGen/Hexagon/early-if-merge-loop.ll | 91 + test/CodeGen/Hexagon/early-if-phi-i1.ll | 2 +- test/CodeGen/Hexagon/early-if-vecpred.ll | 37 + test/CodeGen/Hexagon/eh_return.ll | 2 +- test/CodeGen/Hexagon/eliminate-pred-spill.ll | 5 +- test/CodeGen/Hexagon/expand-condsets-dead-bad.ll | 54 + test/CodeGen/Hexagon/expand-condsets-dead-pred.ll | 45 + test/CodeGen/Hexagon/expand-condsets-rm-reg.mir | 2 +- test/CodeGen/Hexagon/expand-vstorerw-undef2.ll | 216 + test/CodeGen/Hexagon/extload-combine.ll | 18 +- test/CodeGen/Hexagon/extract-basic.ll | 6 +- test/CodeGen/Hexagon/fadd.ll | 2 +- test/CodeGen/Hexagon/find-loop-instr.ll | 79 + test/CodeGen/Hexagon/float-amode.ll | 14 +- test/CodeGen/Hexagon/fmul.ll | 2 +- test/CodeGen/Hexagon/fsel.ll | 4 +- test/CodeGen/Hexagon/fsub.ll | 2 +- test/CodeGen/Hexagon/fusedandshift.ll | 4 +- test/CodeGen/Hexagon/gp-rel.ll | 4 +- test/CodeGen/Hexagon/hwloop-cleanup.ll | 6 +- 
test/CodeGen/Hexagon/hwloop-loop1.ll | 16 +- test/CodeGen/Hexagon/hwloop1.ll | 16 +- test/CodeGen/Hexagon/hwloop2.ll | 2 +- test/CodeGen/Hexagon/hwloop4.ll | 6 +- test/CodeGen/Hexagon/hwloop5.ll | 4 +- .../Hexagon/ifcvt-diamond-bug-2016-08-26.ll | 4 +- test/CodeGen/Hexagon/ifcvt-simple-bprob.ll | 36 + test/CodeGen/Hexagon/inline-asm-vecpred128.ll | 15 + test/CodeGen/Hexagon/insert-basic.ll | 8 +- test/CodeGen/Hexagon/insert4.ll | 4 +- test/CodeGen/Hexagon/intrinsics/alu32_alu.ll | 38 +- test/CodeGen/Hexagon/intrinsics/alu32_perm.ll | 24 +- .../Hexagon/intrinsics/byte-store-double.ll | 41 + test/CodeGen/Hexagon/intrinsics/byte-store.ll | 41 + test/CodeGen/Hexagon/intrinsics/cr.ll | 30 +- test/CodeGen/Hexagon/intrinsics/system_user.ll | 2 +- test/CodeGen/Hexagon/intrinsics/xtype_alu.ll | 254 +- test/CodeGen/Hexagon/intrinsics/xtype_bit.ll | 58 +- test/CodeGen/Hexagon/intrinsics/xtype_complex.ll | 94 +- test/CodeGen/Hexagon/intrinsics/xtype_fp.ll | 44 +- test/CodeGen/Hexagon/intrinsics/xtype_mpy.ll | 430 +- test/CodeGen/Hexagon/intrinsics/xtype_perm.ll | 16 +- test/CodeGen/Hexagon/intrinsics/xtype_pred.ll | 94 +- test/CodeGen/Hexagon/intrinsics/xtype_shift.ll | 202 +- test/CodeGen/Hexagon/isel-exti1.ll | 22 + test/CodeGen/Hexagon/isel-i1arg-crash.ll | 6 + test/CodeGen/Hexagon/isel-op-zext-i1.ll | 13 + .../CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll | 36 + .../CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll | 36 + test/CodeGen/Hexagon/loop-idiom/lcssa.ll | 46 + test/CodeGen/Hexagon/loop-idiom/nullptr-crash.ll | 24 + .../Hexagon/loop-idiom/pmpy-infinite-loop.ll | 83 + test/CodeGen/Hexagon/loop-idiom/pmpy-mod.ll | 84 + test/CodeGen/Hexagon/loop-idiom/pmpy.ll | 33 + test/CodeGen/Hexagon/memops-stack.ll | 36 +- test/CodeGen/Hexagon/newvalueSameReg.ll | 4 +- test/CodeGen/Hexagon/newvaluejump.ll | 2 +- test/CodeGen/Hexagon/newvaluejump2.ll | 2 +- test/CodeGen/Hexagon/newvaluejump3.ll | 79 + test/CodeGen/Hexagon/opt-addr-mode.ll | 4 +- test/CodeGen/Hexagon/opt-fabs.ll | 2 +- test/CodeGen/Hexagon/opt-fneg.ll | 6 +- test/CodeGen/Hexagon/opt-spill-volatile.ll | 10 +- test/CodeGen/Hexagon/pic-local.ll | 4 +- test/CodeGen/Hexagon/pic-simple.ll | 6 +- test/CodeGen/Hexagon/pic-static.ll | 6 +- test/CodeGen/Hexagon/pred-absolute-store.ll | 4 +- test/CodeGen/Hexagon/predicate-logical.ll | 2 +- test/CodeGen/Hexagon/predicate-rcmp.ll | 2 +- test/CodeGen/Hexagon/rdf-copy-undef2.ll | 4 +- test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll | 8 +- test/CodeGen/Hexagon/rdf-phi-up.ll | 8 +- test/CodeGen/Hexagon/readcyclecounter.ll | 10 + test/CodeGen/Hexagon/regalloc-block-overlap.ll | 143 + test/CodeGen/Hexagon/ret-struct-by-val.ll | 2 +- test/CodeGen/Hexagon/runtime-stkchk.ll | 12 +- test/CodeGen/Hexagon/section_7275.ll | 10 +- test/CodeGen/Hexagon/signed_immediates.ll | 6 +- test/CodeGen/Hexagon/stack-align1.ll | 6 +- test/CodeGen/Hexagon/stack-align2.ll | 10 +- test/CodeGen/Hexagon/stack-alloca1.ll | 2 +- test/CodeGen/Hexagon/stack-alloca2.ll | 6 +- test/CodeGen/Hexagon/static.ll | 6 +- test/CodeGen/Hexagon/store-shift.ll | 12 +- test/CodeGen/Hexagon/sube.ll | 49 +- test/CodeGen/Hexagon/subi-asl.ll | 6 +- test/CodeGen/Hexagon/swp-const-tc.ll | 2 +- test/CodeGen/Hexagon/swp-matmul-bitext.ll | 2 +- test/CodeGen/Hexagon/swp-max.ll | 4 +- test/CodeGen/Hexagon/swp-multi-loops.ll | 8 +- test/CodeGen/Hexagon/swp-stages4.ll | 94 + test/CodeGen/Hexagon/swp-stages5.ll | 78 + test/CodeGen/Hexagon/swp-vmult.ll | 8 +- test/CodeGen/Hexagon/swp-vsum.ll | 6 +- test/CodeGen/Hexagon/tail-dup-subreg-map.ll | 2 +- 
test/CodeGen/Hexagon/tfr-to-combine.ll | 6 +- test/CodeGen/Hexagon/tls_pic.ll | 4 +- test/CodeGen/Hexagon/two-crash.ll | 2 +- test/CodeGen/Hexagon/undo-dag-shift.ll | 59 + test/CodeGen/Hexagon/vaddh.ll | 2 +- test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll | 2 +- test/CodeGen/Hexagon/vect/vect-loadv4i16.ll | 4 +- test/CodeGen/Hexagon/vect/vect-shift-imm.ll | 12 +- test/CodeGen/Hexagon/vect/vect-shuffle.ll | 2 +- test/CodeGen/Hexagon/vect/vect-vshifts.ll | 4 +- test/CodeGen/Hexagon/vect/vect-xor.ll | 2 +- test/CodeGen/MIR/AArch64/atomic-memoperands.mir | 30 + test/CodeGen/MIR/AArch64/register-operand-bank.mir | 20 + .../MIR/AMDGPU/expected-target-index-name.mir | 20 +- test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir | 18 +- test/CodeGen/MIR/AMDGPU/intrinsics.mir | 6 +- .../MIR/AMDGPU/invalid-target-index-operand.mir | 20 +- test/CodeGen/MIR/AMDGPU/target-index-operands.mir | 21 +- test/CodeGen/MIR/Generic/llvmIR.mir | 4 +- test/CodeGen/MIR/Generic/llvmIRMissing.mir | 4 +- .../machine-basic-block-ir-block-reference.mir | 2 +- .../machine-function-missing-body-error.mir | 15 - .../MIR/Generic/machine-function-missing-body.mir | 15 + .../Generic/machine-function-missing-function.mir | 4 - .../MIR/Generic/machine-function-missing-name.mir | 4 - test/CodeGen/MIR/Generic/machine-function.mir | 10 +- test/CodeGen/MIR/Generic/register-info.mir | 4 - test/CodeGen/MIR/Generic/runPass.mir | 2 +- test/CodeGen/MIR/X86/dynamic-regmask.ll | 30 + .../expected-named-register-in-allocation-hint.mir | 2 +- ...xpected-size-integer-after-memory-operation.mir | 2 +- .../MIR/X86/register-operand-class-invalid0.mir | 13 + .../MIR/X86/register-operand-class-invalid1.mir | 14 + test/CodeGen/MIR/X86/register-operand-class.mir | 27 + .../MIR/X86/used-physical-register-info.mir | 109 - test/CodeGen/MSP430/AddrMode-bis-rx.ll | 14 +- test/CodeGen/MSP430/AddrMode-bis-xr.ll | 14 +- test/CodeGen/MSP430/AddrMode-mov-rx.ll | 14 +- test/CodeGen/MSP430/AddrMode-mov-xr.ll | 14 +- test/CodeGen/MSP430/Inst16mm.ll | 2 +- test/CodeGen/MSP430/Inst16mr.ll | 12 +- test/CodeGen/MSP430/Inst16ri.ll | 10 +- test/CodeGen/MSP430/Inst16rm.ll | 10 +- test/CodeGen/MSP430/Inst16rr.ll | 12 +- test/CodeGen/MSP430/Inst8mr.ll | 12 +- test/CodeGen/MSP430/Inst8ri.ll | 10 +- test/CodeGen/MSP430/Inst8rm.ll | 10 +- test/CodeGen/MSP430/Inst8rr.ll | 10 +- test/CodeGen/MSP430/bit.ll | 20 +- test/CodeGen/MSP430/byval.ll | 2 +- test/CodeGen/MSP430/cc_args.ll | 63 +- test/CodeGen/MSP430/cc_ret.ll | 12 +- test/CodeGen/MSP430/jumptable.ll | 4 +- test/CodeGen/MSP430/memset.ll | 6 +- test/CodeGen/MSP430/setcc.ll | 56 +- test/CodeGen/MSP430/struct-return.ll | 23 + test/CodeGen/MSP430/vararg.ll | 10 +- test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll | 4 +- test/CodeGen/Mips/2010-07-20-Switch.ll | 53 +- .../CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll | 30 +- test/CodeGen/Mips/Fast-ISel/fastcc-miss.ll | 2 +- test/CodeGen/Mips/abicalls.ll | 12 +- test/CodeGen/Mips/blockaddr.ll | 12 +- test/CodeGen/Mips/brconnez.ll | 4 +- test/CodeGen/Mips/cconv/arguments-float.ll | 45 +- test/CodeGen/Mips/cconv/arguments-fp128.ll | 2 +- .../Mips/cconv/arguments-hard-float-varargs.ll | 4 +- test/CodeGen/Mips/cconv/arguments-hard-float.ll | 14 +- test/CodeGen/Mips/cconv/arguments-hard-fp128.ll | 2 +- test/CodeGen/Mips/cconv/arguments-struct.ll | 2 +- .../cconv/arguments-varargs-small-structs-byte.ll | 2 + ...arguments-varargs-small-structs-combinations.ll | 2 + test/CodeGen/Mips/cconv/arguments-varargs.ll | 62 +- test/CodeGen/Mips/cconv/arguments.ll | 6 +- test/CodeGen/Mips/cconv/return-float.ll 
| 6 +- test/CodeGen/Mips/cconv/return-hard-float.ll | 6 +- test/CodeGen/Mips/cconv/return-hard-fp128.ll | 4 +- test/CodeGen/Mips/cconv/return-hard-struct-f128.ll | 4 +- test/CodeGen/Mips/cconv/return-struct.ll | 36 +- test/CodeGen/Mips/cconv/return.ll | 12 +- test/CodeGen/Mips/cconv/roundl-call.ll | 10 +- test/CodeGen/Mips/cins.ll | 92 + .../Mips/compactbranches/compact-branches-64.ll | 3 +- .../Mips/compactbranches/compact-branches.ll | 10 +- test/CodeGen/Mips/compactbranches/empty-block.mir | 92 + test/CodeGen/Mips/cstmaterialization/stack.ll | 2 +- test/CodeGen/Mips/dext.ll | 105 + test/CodeGen/Mips/elf_eflags.ll | 9 +- test/CodeGen/Mips/fastcc.ll | 76 +- test/CodeGen/Mips/fcmp.ll | 50 +- test/CodeGen/Mips/fcopysign-f32-f64.ll | 4 +- test/CodeGen/Mips/global-address.ll | 12 +- test/CodeGen/Mips/inlineasm-constraint_ZC_2.ll | 6 +- test/CodeGen/Mips/llvm-ir/ashr.ll | 30 +- test/CodeGen/Mips/llvm-ir/call.ll | 24 +- test/CodeGen/Mips/llvm-ir/lshr.ll | 28 +- test/CodeGen/Mips/llvm-ir/shl.ll | 28 +- test/CodeGen/Mips/load-store-left-right.ll | 219 +- test/CodeGen/Mips/mature-mc-support.ll | 2 +- test/CodeGen/Mips/micromips-compact-branches.ll | 3 +- test/CodeGen/Mips/micromips-li.ll | 2 +- test/CodeGen/Mips/mips64-f128-call.ll | 27 +- test/CodeGen/Mips/mips64-f128.ll | 58 +- test/CodeGen/Mips/mips64-libcall.ll | 2 +- test/CodeGen/Mips/mips64instrs.ll | 12 +- test/CodeGen/Mips/mno-ldc1-sdc1.ll | 46 +- test/CodeGen/Mips/msa/3r_4r_widen.ll | 84 +- test/CodeGen/Mips/msa/basic_operations.ll | 59 +- test/CodeGen/Mips/msa/bitwise.ll | 16 +- test/CodeGen/Mips/msa/bmzi_bmnzi.ll | 55 + test/CodeGen/Mips/msa/f16-llvm-ir.ll | 14 +- test/CodeGen/Mips/msa/i5-b.ll | 8 +- test/CodeGen/Mips/msa/i5_ld_st.ll | 32 +- test/CodeGen/Mips/msa/immediates.ll | 10 +- test/CodeGen/Mips/o32_cc_byval.ll | 54 +- test/CodeGen/Mips/o32_cc_vararg.ll | 8 +- test/CodeGen/Mips/return_address.ll | 2 +- test/CodeGen/Mips/stackcoloring.ll | 8 +- test/CodeGen/Mips/start-asm-file.ll | 2 +- test/CodeGen/Mips/stchar.ll | 4 +- test/CodeGen/Mips/tailcall/tailcall-wrong-isa.ll | 21 +- test/CodeGen/Mips/tailcall/tailcall.ll | 69 +- test/CodeGen/Mips/tnaked.ll | 2 +- .../Mips/xray-mips-attribute-instrumentation.ll | 147 + test/CodeGen/Mips/xray-section-group.ll | 31 + test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 34 + test/CodeGen/NVPTX/access-non-generic.ll | 3 +- test/CodeGen/NVPTX/add-128bit.ll | 2 +- test/CodeGen/NVPTX/aggregate-return.ll | 35 +- test/CodeGen/NVPTX/bug22322.ll | 8 +- test/CodeGen/NVPTX/combine-min-max.ll | 134 +- test/CodeGen/NVPTX/convert-fp.ll | 115 +- test/CodeGen/NVPTX/ctlz.ll | 128 +- test/CodeGen/NVPTX/f16-instructions.ll | 1063 +++ test/CodeGen/NVPTX/f16x2-instructions.ll | 1426 +++ test/CodeGen/NVPTX/fast-math.ll | 137 +- test/CodeGen/NVPTX/fcos-no-fast-math.ll | 14 + test/CodeGen/NVPTX/fsin-no-fast-math.ll | 14 + test/CodeGen/NVPTX/global-variable-big.ll | 9 + test/CodeGen/NVPTX/half.ll | 8 +- test/CodeGen/NVPTX/idioms.ll | 31 + test/CodeGen/NVPTX/intrinsics.ll | 101 +- test/CodeGen/NVPTX/ldg-invariant.ll | 24 + test/CodeGen/NVPTX/ldparam-v4.ll | 5 +- test/CodeGen/NVPTX/lower-aggr-copies.ll | 44 +- test/CodeGen/NVPTX/lower-alloca.ll | 2 +- test/CodeGen/NVPTX/math-intrins.ll | 25 + test/CodeGen/NVPTX/misaligned-vector-ldst.ll | 58 + test/CodeGen/NVPTX/named-barriers.ll | 40 + test/CodeGen/NVPTX/nvvm-reflect.ll | 65 +- test/CodeGen/NVPTX/param-load-store.ll | 939 ++ test/CodeGen/NVPTX/rsqrt.ll | 13 - test/CodeGen/NVPTX/sqrt-approx.ll | 150 + test/CodeGen/NVPTX/vec-param-load.ll | 83 +- test/CodeGen/NVPTX/vec8.ll | 
13 +- test/CodeGen/NVPTX/vector-call.ll | 22 +- .../PowerPC/2006-07-07-ComputeMaskedBits.ll | 36 +- .../CodeGen/PowerPC/2007-11-16-landingpad-split.ll | 1 - test/CodeGen/PowerPC/BreakableToken-reduced.ll | 4 +- test/CodeGen/PowerPC/aantidep-def-ec.mir | 16 - test/CodeGen/PowerPC/addegluecrash.ll | 58 + test/CodeGen/PowerPC/addi-licm.ll | 8 +- test/CodeGen/PowerPC/anon_aggr.ll | 59 +- test/CodeGen/PowerPC/atomics-regression.ll | 9546 ++++++++++++++++++++ test/CodeGen/PowerPC/bitcasts-direct-move.ll | 4 +- test/CodeGen/PowerPC/branch_coalesce.ll | 31 + test/CodeGen/PowerPC/complex-return.ll | 12 +- test/CodeGen/PowerPC/crbit-asm.ll | 7 + test/CodeGen/PowerPC/crbits.ll | 9 +- test/CodeGen/PowerPC/ctrloop-i128.ll | 34 + test/CodeGen/PowerPC/ctrloop-intrin.ll | 12 +- test/CodeGen/PowerPC/expand-contiguous-isel.ll | 151 + test/CodeGen/PowerPC/expand-isel-1.mir | 57 + test/CodeGen/PowerPC/expand-isel-2.mir | 57 + test/CodeGen/PowerPC/expand-isel-3.mir | 58 + test/CodeGen/PowerPC/expand-isel-4.mir | 59 + test/CodeGen/PowerPC/expand-isel-5.mir | 54 + test/CodeGen/PowerPC/expand-isel-6.mir | 57 + test/CodeGen/PowerPC/expand-isel-7.mir | 58 + test/CodeGen/PowerPC/expand-isel-8.mir | 65 + test/CodeGen/PowerPC/expand-isel.ll | 227 + test/CodeGen/PowerPC/fast-isel-load-store.ll | 2 +- test/CodeGen/PowerPC/fma-aggr-FMF.ll | 35 + test/CodeGen/PowerPC/fold-zero.ll | 21 +- .../PowerPC/fp-int-conversions-direct-moves.ll | 8 +- .../PowerPC/fp128-bitcast-after-operation.ll | 32 +- test/CodeGen/PowerPC/i1-ext-fold.ll | 25 + test/CodeGen/PowerPC/i1-to-double.ll | 22 +- test/CodeGen/PowerPC/i64_fp_round.ll | 11 + test/CodeGen/PowerPC/ifcvt.ll | 11 +- test/CodeGen/PowerPC/indirectbr.ll | 36 +- test/CodeGen/PowerPC/isel.ll | 19 +- test/CodeGen/PowerPC/jaggedstructs.ll | 52 +- test/CodeGen/PowerPC/lsa.ll | 16 +- test/CodeGen/PowerPC/mature-mc-support.ll | 2 +- test/CodeGen/PowerPC/mcm-obj.ll | 5 +- test/CodeGen/PowerPC/misched-inorder-latency.ll | 4 +- test/CodeGen/PowerPC/optcmp.ll | 32 +- test/CodeGen/PowerPC/p8-isel-sched.ll | 13 +- .../PowerPC/p8-scalar_vector_conversions.ll | 601 +- test/CodeGen/PowerPC/ppc-crbits-onoff.ll | 13 + test/CodeGen/PowerPC/ppc-shrink-wrapping.ll | 2 +- test/CodeGen/PowerPC/ppc64-align-long-double.ll | 41 +- test/CodeGen/PowerPC/ppc64-gep-opt.ll | 4 +- test/CodeGen/PowerPC/ppc64le-aggregates.ll | 5 +- test/CodeGen/PowerPC/pr30451.ll | 20 +- test/CodeGen/PowerPC/pr32063.ll | 16 + test/CodeGen/PowerPC/pr32140.ll | 59 + test/CodeGen/PowerPC/pristine-and-livein.mir | 330 + test/CodeGen/PowerPC/select-i1-vs-i1.ll | 186 +- test/CodeGen/PowerPC/select_const.ll | 789 ++ test/CodeGen/PowerPC/setcc-logic.ll | 478 + test/CodeGen/PowerPC/setcc-to-sub.ll | 73 +- test/CodeGen/PowerPC/sjlj_no0x.ll | 29 + test/CodeGen/PowerPC/srl-mask.ll | 11 + test/CodeGen/PowerPC/stacksize.ll | 86 + test/CodeGen/PowerPC/structsinmem.ll | 28 +- test/CodeGen/PowerPC/structsinregs.ll | 60 +- test/CodeGen/PowerPC/subreg-postra-2.ll | 7 + test/CodeGen/PowerPC/subreg-postra.ll | 6 + test/CodeGen/PowerPC/subtract_from_imm.ll | 41 + test/CodeGen/PowerPC/swaps-le-4.ll | 8 +- test/CodeGen/PowerPC/swaps-le-7.ll | 4 +- .../PowerPC/tail-dup-branch-to-fallthrough.ll | 6 +- test/CodeGen/PowerPC/tail-dup-break-cfg.ll | 140 + test/CodeGen/PowerPC/tail-dup-layout.ll | 494 +- test/CodeGen/PowerPC/toc-load-sched-bug.ll | 28 +- test/CodeGen/PowerPC/vec_absd.ll | 4 +- test/CodeGen/PowerPC/vec_cmp.ll | 40 +- test/CodeGen/PowerPC/vsx-args.ll | 12 +- test/CodeGen/PowerPC/vsx-infl-copy1.ll | 18 +- test/CodeGen/PowerPC/vsx-p9.ll | 12 +- 
test/CodeGen/SPARC/mature-mc-support.ll | 2 +- test/CodeGen/SPARC/register-clobber.ll | 35 + test/CodeGen/SPARC/reserved-regs.ll | 4 +- test/CodeGen/SPARC/sjlj.ll | 11 +- test/CodeGen/SystemZ/DAGCombine_trunc_extract.ll | 18 + .../SystemZ/DAGCombiner_illegal_BUILD_VECTOR.ll | 26 + test/CodeGen/SystemZ/expand-zext-pseudo.ll | 132 + test/CodeGen/SystemZ/extract-vector-elt-zEC12.ll | 21 + test/CodeGen/SystemZ/fold-memory-op-impl.ll | 129 + test/CodeGen/SystemZ/fp-cmp-05.ll | 8 +- test/CodeGen/SystemZ/int-cmp-44.ll | 6 +- test/CodeGen/SystemZ/locr-legal-regclass.ll | 20 + test/CodeGen/SystemZ/mature-mc-support.ll | 2 +- test/CodeGen/SystemZ/memchr-01.ll | 54 +- test/CodeGen/SystemZ/memchr-02.ll | 57 - test/CodeGen/SystemZ/memcmp-02.ll | 139 - test/CodeGen/SystemZ/pr32372.ll | 31 + test/CodeGen/SystemZ/pr32505.ll | 20 + .../SystemZ/splitMove_undefReg_mverifier.ll | 413 + test/CodeGen/SystemZ/stack-guard.ll | 8 +- test/CodeGen/SystemZ/strcmp-02.ll | 72 - test/CodeGen/SystemZ/strlen-02.ll | 39 - test/CodeGen/SystemZ/unaligned-01.ll | 5 +- test/CodeGen/SystemZ/undef-flag.ll | 22 + test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll | 5784 ++++++++++++ test/CodeGen/SystemZ/vec-cmpsel.ll | 3378 +++++++ test/CodeGen/SystemZ/vec-sext.ll | 91 + test/CodeGen/SystemZ/vec-trunc-to-i1.ll | 37 + test/CodeGen/SystemZ/vec-zext.ll | 91 + test/CodeGen/SystemZ/vectorizer-output-3xi32.ll | 10 + test/CodeGen/Thumb/2010-07-15-debugOrdering.ll | 2 +- test/CodeGen/Thumb/PR17309.ll | 16 +- test/CodeGen/Thumb/cmp-add-fold.ll | 3 +- test/CodeGen/Thumb/copy_thumb.ll | 12 +- test/CodeGen/Thumb/ispositive.ll | 9 + test/CodeGen/Thumb/long.ll | 150 +- test/CodeGen/Thumb/mature-mc-support.ll | 2 +- test/CodeGen/Thumb/remove-unneeded-push-pop.ll | 1052 +++ test/CodeGen/Thumb/stack-access.ll | 26 +- .../Thumb/stack-coloring-without-frame-ptr.ll | 10 +- test/CodeGen/Thumb/stack_guard_remat.ll | 8 +- test/CodeGen/Thumb/stm-deprecated.ll | 19 + test/CodeGen/Thumb/tbb-reuse.mir | 151 + test/CodeGen/Thumb/thumb-shrink-wrapping.ll | 22 +- test/CodeGen/Thumb2/cbnz.ll | 2 +- test/CodeGen/Thumb2/float-cmp.ll | 16 +- test/CodeGen/Thumb2/ifcvt-compare.ll | 2 +- test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll | 4 +- test/CodeGen/Thumb2/intrinsics-coprocessor.ll | 93 + test/CodeGen/Thumb2/stack_guard_remat.ll | 8 +- test/CodeGen/Thumb2/tbb-removeadd.mir | 124 + test/CodeGen/Thumb2/thumb2-pack.ll | 2 +- test/CodeGen/Thumb2/thumb2-rev.ll | 2 +- test/CodeGen/Thumb2/thumb2-smla.ll | 4 +- test/CodeGen/Thumb2/thumb2-smul.ll | 2 +- test/CodeGen/Thumb2/thumb2-sxt-uxt.ll | 51 +- test/CodeGen/Thumb2/thumb2-sxt_rot.ll | 31 +- test/CodeGen/Thumb2/thumb2-uxt_rot.ll | 41 +- test/CodeGen/Thumb2/thumb2-uxtb.ll | 120 +- test/CodeGen/Thumb2/v8_IT_4.ll | 5 +- test/CodeGen/WebAssembly/address-offsets.ll | 4 +- test/CodeGen/WebAssembly/byval.ll | 25 +- test/CodeGen/WebAssembly/call.ll | 25 +- test/CodeGen/WebAssembly/cfg-stackify.ll | 8 +- test/CodeGen/WebAssembly/cfi.ll | 2 +- test/CodeGen/WebAssembly/comparisons_f32.ll | 104 +- test/CodeGen/WebAssembly/comparisons_f64.ll | 104 +- test/CodeGen/WebAssembly/comparisons_i32.ll | 24 +- test/CodeGen/WebAssembly/comparisons_i64.ll | 24 +- test/CodeGen/WebAssembly/conv.ll | 4 +- test/CodeGen/WebAssembly/copysign-casts.ll | 10 +- test/CodeGen/WebAssembly/cpus.ll | 16 +- test/CodeGen/WebAssembly/dbgvalue.ll | 4 +- test/CodeGen/WebAssembly/dead-vreg.ll | 2 +- test/CodeGen/WebAssembly/divrem-constant.ll | 2 +- test/CodeGen/WebAssembly/f16.ll | 29 + test/CodeGen/WebAssembly/f32.ll | 64 +- test/CodeGen/WebAssembly/f64.ll | 
64 +- test/CodeGen/WebAssembly/fast-isel-noreg.ll | 2 +- test/CodeGen/WebAssembly/fast-isel.ll | 3 +- test/CodeGen/WebAssembly/frem.ll | 4 +- test/CodeGen/WebAssembly/func.ll | 6 +- test/CodeGen/WebAssembly/function-bitcasts.ll | 32 +- test/CodeGen/WebAssembly/global.ll | 54 +- test/CodeGen/WebAssembly/globl.ll | 2 +- test/CodeGen/WebAssembly/i128.ll | 2 +- .../WebAssembly/i32-load-store-alignment.ll | 4 +- test/CodeGen/WebAssembly/i32.ll | 88 +- .../WebAssembly/i64-load-store-alignment.ll | 4 +- test/CodeGen/WebAssembly/i64.ll | 88 +- test/CodeGen/WebAssembly/ident.ll | 2 +- test/CodeGen/WebAssembly/immediates.ll | 2 +- test/CodeGen/WebAssembly/implicit-def.ll | 2 +- test/CodeGen/WebAssembly/inline-asm.ll | 5 +- test/CodeGen/WebAssembly/irreducible-cfg.ll | 4 +- test/CodeGen/WebAssembly/legalize.ll | 4 +- test/CodeGen/WebAssembly/load-ext.ll | 4 +- test/CodeGen/WebAssembly/load-store-i1.ll | 4 +- test/CodeGen/WebAssembly/load.ll | 14 +- .../CodeGen/WebAssembly/lower-em-ehsjlj-options.ll | 2 +- .../WebAssembly/lower-em-exceptions-whitelist.ll | 2 +- test/CodeGen/WebAssembly/lower-em-exceptions.ll | 2 +- test/CodeGen/WebAssembly/lower-em-sjlj.ll | 2 +- test/CodeGen/WebAssembly/mem-intrinsics.ll | 4 +- test/CodeGen/WebAssembly/memory-addr32.ll | 17 +- test/CodeGen/WebAssembly/non-executable-stack.ll | 2 +- test/CodeGen/WebAssembly/offset-folding.ll | 2 +- test/CodeGen/WebAssembly/offset.ll | 4 +- test/CodeGen/WebAssembly/phi.ll | 9 +- test/CodeGen/WebAssembly/reg-stackify.ll | 8 +- test/CodeGen/WebAssembly/return-int32.ll | 8 +- test/CodeGen/WebAssembly/return-void.ll | 6 +- test/CodeGen/WebAssembly/returned.ll | 4 +- test/CodeGen/WebAssembly/select.ll | 6 +- test/CodeGen/WebAssembly/signext-zeroext.ll | 4 +- test/CodeGen/WebAssembly/simd-arith.ll | 10 +- test/CodeGen/WebAssembly/stack-alignment.ll | 102 +- test/CodeGen/WebAssembly/store-trunc.ll | 4 +- test/CodeGen/WebAssembly/store.ll | 18 +- test/CodeGen/WebAssembly/switch.ll | 2 +- test/CodeGen/WebAssembly/unreachable.ll | 2 +- .../WebAssembly/unsupported-function-bitcasts.ll | 7 +- test/CodeGen/WebAssembly/unused-argument.ll | 4 +- test/CodeGen/WebAssembly/userstack.ll | 173 +- test/CodeGen/WebAssembly/varargs.ll | 11 +- test/CodeGen/WebAssembly/vtable.ll | 13 +- test/CodeGen/X86/2003-11-03-GlobalBool.ll | 6 +- test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll | 19 +- .../X86/2004-02-14-InefficientStackPointer.ll | 9 +- test/CodeGen/X86/2005-01-17-CycleInDAG.ll | 17 +- test/CodeGen/X86/2005-02-14-IllegalAssembler.ll | 3 +- test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll | 35 +- test/CodeGen/X86/2006-03-01-InstrSchedBug.ll | 28 +- test/CodeGen/X86/2006-03-02-InstrSchedBug.ll | 17 +- test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll | 1 - test/CodeGen/X86/2008-02-14-BitMiscompile.ll | 3 +- .../X86/2010-04-30-LocalAlloc-LandingPad.ll | 5 +- test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll | 58 +- test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll | 2 +- test/CodeGen/X86/2011-10-19-widen_vselect.ll | 100 +- test/CodeGen/X86/2011-10-21-widen-cmp.ll | 6 +- test/CodeGen/X86/2011-11-30-or.ll | 2 +- test/CodeGen/X86/2011-12-15-vec_shift.ll | 2 +- test/CodeGen/X86/2011-12-8-bitcastintprom.ll | 1 - test/CodeGen/X86/2012-07-10-extload64.ll | 32 +- test/CodeGen/X86/2012-11-28-merge-store-alias.ll | 2 +- test/CodeGen/X86/DynamicCalleeSavedRegisters.ll | 60 + test/CodeGen/X86/GlobalISel/X86-regbankselect.mir | 634 ++ test/CodeGen/X86/GlobalISel/binop-isel.ll | 186 + test/CodeGen/X86/GlobalISel/constant.ll | 54 + 
.../GlobalISel/frameIndex-instructionselect.mir | 36 + test/CodeGen/X86/GlobalISel/frameIndex.ll | 30 + test/CodeGen/X86/GlobalISel/irtranslator-call.ll | 1 + .../X86/GlobalISel/irtranslator-callingconv.ll | 310 + .../GlobalISel/irtranslator-callingconv_64bit.ll | 29 + test/CodeGen/X86/GlobalISel/legalize-add.mir | 40 + test/CodeGen/X86/GlobalISel/legalize-const.mir | 43 + test/CodeGen/X86/GlobalISel/legalize-sub.mir | 40 + test/CodeGen/X86/GlobalISel/memop-isel.ll | 189 + test/CodeGen/X86/GlobalISel/select-constant.mir | 143 + .../X86/GlobalISel/x86_64-instructionselect.mir | 1022 +++ test/CodeGen/X86/MergeConsecutiveStores.ll | 18 +- test/CodeGen/X86/StackColoring-dbg.ll | 8 +- test/CodeGen/X86/StackColoring.ll | 194 +- test/CodeGen/X86/absolute-cmp.ll | 39 + test/CodeGen/X86/absolute-rotate.ll | 4 +- test/CodeGen/X86/add-of-carry.ll | 31 +- test/CodeGen/X86/adde-carry.ll | 180 +- test/CodeGen/X86/aes_intrinsics.ll | 64 +- test/CodeGen/X86/and-sink.ll | 181 + test/CodeGen/X86/arg-copy-elide.ll | 299 + test/CodeGen/X86/atomic128.ll | 504 +- test/CodeGen/X86/avg.ll | 871 +- test/CodeGen/X86/avx-cvt-3.ll | 148 + test/CodeGen/X86/avx-cvt.ll | 3 +- test/CodeGen/X86/avx-intrinsics-fast-isel.ll | 40 +- test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll | 4 +- test/CodeGen/X86/avx-intrinsics-x86.ll | 3005 +----- test/CodeGen/X86/avx-intrinsics-x86_64.ll | 92 +- test/CodeGen/X86/avx-shuffle-x86_32.ll | 3 +- test/CodeGen/X86/avx-trunc.ll | 2 + test/CodeGen/X86/avx-vbroadcast.ll | 314 +- test/CodeGen/X86/avx-vperm2x128.ll | 3 +- test/CodeGen/X86/avx-vzeroupper.ll | 5 +- test/CodeGen/X86/avx2-conversions.ll | 4 +- test/CodeGen/X86/avx2-gather.ll | 60 + test/CodeGen/X86/avx2-intrinsics-fast-isel.ll | 26 + test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll | 30 +- test/CodeGen/X86/avx2-intrinsics-x86.ll | 133 +- test/CodeGen/X86/avx2-shift.ll | 12 +- test/CodeGen/X86/avx2-vbroadcast.ll | 430 +- test/CodeGen/X86/avx2-vector-shifts.ll | 12 +- test/CodeGen/X86/avx512-adc-sbb.ll | 27 + test/CodeGen/X86/avx512-any_extend_load.ll | 21 +- test/CodeGen/X86/avx512-arith.ll | 1 + test/CodeGen/X86/avx512-bugfix-26264.ll | 26 +- test/CodeGen/X86/avx512-calling-conv.ll | 5 +- test/CodeGen/X86/avx512-cmp-kor-sequence.ll | 1 + test/CodeGen/X86/avx512-cmp.ll | 72 +- test/CodeGen/X86/avx512-cvt.ll | 525 +- test/CodeGen/X86/avx512-ext.ll | 411 +- test/CodeGen/X86/avx512-extract-subvector.ll | 95 +- test/CodeGen/X86/avx512-fsel.ll | 35 +- test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 226 +- test/CodeGen/X86/avx512-insert-extract.ll | 1496 ++- test/CodeGen/X86/avx512-insert-extract_i1.ll | 37 + test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 19 + test/CodeGen/X86/avx512-intrinsics.ll | 928 +- test/CodeGen/X86/avx512-load-store.ll | 189 +- test/CodeGen/X86/avx512-logic.ll | 50 +- test/CodeGen/X86/avx512-mask-op.ll | 2197 ++++- test/CodeGen/X86/avx512-mask-spills.ll | 4 + test/CodeGen/X86/avx512-masked-memop-64-32.ll | 60 +- test/CodeGen/X86/avx512-masked_memop-16-8.ll | 13 +- test/CodeGen/X86/avx512-memfold.ll | 73 + test/CodeGen/X86/avx512-mov.ll | 2 +- test/CodeGen/X86/avx512-pmovxrm.ll | 6 +- test/CodeGen/X86/avx512-regcall-Mask.ll | 14 +- test/CodeGen/X86/avx512-regcall-NoMask.ll | 37 +- test/CodeGen/X86/avx512-select.ll | 8 +- test/CodeGen/X86/avx512-skx-insert-subvec.ll | 19 +- test/CodeGen/X86/avx512-trunc.ll | 277 + test/CodeGen/X86/avx512-vbroadcast.ll | 22 +- test/CodeGen/X86/avx512-vbroadcasti128.ll | 24 +- test/CodeGen/X86/avx512-vec-cmp.ll | 161 +- test/CodeGen/X86/avx512-vpermv3-commute.ll | 18 +- 
test/CodeGen/X86/avx512-vpternlog-commute.ll | 817 +- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 572 ++ test/CodeGen/X86/avx512bw-intrinsics.ll | 240 +- test/CodeGen/X86/avx512bw-mask-op.ll | 90 + test/CodeGen/X86/avx512bw-vec-cmp.ll | 36 +- .../CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll | 24 +- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll | 891 +- test/CodeGen/X86/avx512bwvl-intrinsics.ll | 2330 ++--- test/CodeGen/X86/avx512bwvl-vec-cmp.ll | 72 +- test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll | 47 + test/CodeGen/X86/avx512cd-intrinsics.ll | 98 +- test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll | 71 + test/CodeGen/X86/avx512cdvl-intrinsics.ll | 84 +- test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll | 38 +- test/CodeGen/X86/avx512dq-intrinsics.ll | 123 +- test/CodeGen/X86/avx512dq-mask-op.ll | 8 +- test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll | 186 +- test/CodeGen/X86/avx512dqvl-intrinsics.ll | 201 +- test/CodeGen/X86/avx512er-intrinsics.ll | 125 +- test/CodeGen/X86/avx512ifma-intrinsics.ll | 97 +- test/CodeGen/X86/avx512ifmavl-intrinsics.ll | 97 +- test/CodeGen/X86/avx512vbmivl-intrinsics.ll | 38 +- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 157 +- test/CodeGen/X86/avx512vl-intrinsics.ll | 2092 ++++- test/CodeGen/X86/avx512vl-logic.ll | 72 +- test/CodeGen/X86/avx512vl-vbroadcast.ll | 30 +- test/CodeGen/X86/avx512vl-vec-cmp.ll | 144 +- test/CodeGen/X86/bc-extract.ll | 37 +- test/CodeGen/X86/bitcast-mmx.ll | 126 +- test/CodeGen/X86/bitreverse.ll | 573 +- test/CodeGen/X86/block-placement.ll | 69 +- test/CodeGen/X86/block-placement.mir | 4 +- test/CodeGen/X86/bool-ext-inc.ll | 32 + test/CodeGen/X86/branchfolding-debugloc.ll | 83 + test/CodeGen/X86/brcond.ll | 39 - test/CodeGen/X86/break-false-dep.ll | 57 + test/CodeGen/X86/bt.ll | 8 +- test/CodeGen/X86/buildvec-insertvec.ll | 517 +- test/CodeGen/X86/bypass-slow-division-32.ll | 15 +- test/CodeGen/X86/catchpad-lifetime.ll | 12 +- test/CodeGen/X86/catchpad-weight.ll | 8 +- test/CodeGen/X86/chain_order.ll | 4 +- .../CodeGen/X86/clear_upper_vector_element_bits.ll | 1296 ++- test/CodeGen/X86/clflushopt.ll | 13 + test/CodeGen/X86/clzero.ll | 23 + test/CodeGen/X86/cmov.ll | 5 +- test/CodeGen/X86/cmovcmov.ll | 19 +- .../code_placement_outline_optional_branches.ll | 77 - test/CodeGen/X86/codegen-prepare-addrmode-sext.ll | 41 +- test/CodeGen/X86/codegen-prepare-extload.ll | 15 +- test/CodeGen/X86/codegen-prepare.ll | 1 - test/CodeGen/X86/combine-abs.ll | 99 + test/CodeGen/X86/combine-and.ll | 25 + test/CodeGen/X86/combine-fcopysign.ll | 2 +- test/CodeGen/X86/combine-shl.ll | 30 +- test/CodeGen/X86/combine-testm-and.ll | 10 +- test/CodeGen/X86/combiner-aa-0.ll | 20 - test/CodeGen/X86/combiner-aa-1.ll | 23 - test/CodeGen/X86/commute-3dnow.ll | 270 + test/CodeGen/X86/commute-clmul.ll | 73 +- test/CodeGen/X86/commute-fcmp.ll | 38 +- test/CodeGen/X86/commute-xop.ll | 247 +- test/CodeGen/X86/compare-global.ll | 2 +- test/CodeGen/X86/complex-fastmath.ll | 215 + test/CodeGen/X86/compress_expand.ll | 102 +- test/CodeGen/X86/conditional-indecrement.ll | 96 +- test/CodeGen/X86/conditional-tailcall.ll | 163 + test/CodeGen/X86/copy-eflags.ll | 17 +- test/CodeGen/X86/copy-propagation.ll | 33 +- test/CodeGen/X86/crash.ll | 4 +- test/CodeGen/X86/ctpop-combine.ll | 10 +- test/CodeGen/X86/dag-fmf-cse.ll | 2 +- test/CodeGen/X86/dag-merge-fast-accesses.ll | 12 +- test/CodeGen/X86/dagcombine-and-setcc.ll | 7 +- test/CodeGen/X86/dagcombine-cse.ll | 37 +- .../X86/dbg-changes-codegen-branch-folding.ll | 12 +- test/CodeGen/X86/div-rem-simplify.ll 
| 187 + test/CodeGen/X86/divrem8_ext.ll | 3 +- .../X86/dont-trunc-store-double-to-float.ll | 6 +- test/CodeGen/X86/dropped_constructor.ll | 19 + test/CodeGen/X86/dwarf-headers.ll | 109 + test/CodeGen/X86/dynamic-alloca-lifetime.ll | 12 +- test/CodeGen/X86/elf-associated.ll | 39 + test/CodeGen/X86/evex-to-vex-compress.mir | 230 +- test/CodeGen/X86/extract-store.ll | 577 +- test/CodeGen/X86/extractelement-index.ll | 58 +- .../extractelement-legalization-store-ordering.ll | 15 +- test/CodeGen/X86/fadd-combines.ll | 2 +- test/CodeGen/X86/fast-isel-abort-warm.ll | 14 + test/CodeGen/X86/fast-isel-cmp.ll | 1194 ++- test/CodeGen/X86/fast-isel-deadcode.ll | 8 +- test/CodeGen/X86/fast-isel-load-i1.ll | 12 +- test/CodeGen/X86/fast-isel-nontemporal.ll | 47 +- test/CodeGen/X86/fast-isel-select-cmov.ll | 63 +- test/CodeGen/X86/fast-isel-select-sse.ll | 92 +- test/CodeGen/X86/fast-isel-x86-64.ll | 2 +- test/CodeGen/X86/fast-isel-x86.ll | 2 +- test/CodeGen/X86/fast-isel.ll | 4 +- test/CodeGen/X86/fentry-insertion.ll | 16 + test/CodeGen/X86/file-source-filename.ll | 4 + test/CodeGen/X86/fma-fneg-combine.ll | 32 +- test/CodeGen/X86/fma.ll | 402 +- test/CodeGen/X86/fma_patterns.ll | 2 +- test/CodeGen/X86/fold-vector-sext-zext.ll | 9 +- test/CodeGen/X86/fp-intrinsics.ll | 111 + test/CodeGen/X86/fp-select-cmp-and.ll | 32 +- test/CodeGen/X86/fp-une-cmp.ll | 4 +- test/CodeGen/X86/fp128-cast.ll | 2 +- test/CodeGen/X86/fp128-compare.ll | 6 +- test/CodeGen/X86/fp128-g.ll | 3 +- test/CodeGen/X86/fp128-i128.ll | 249 +- test/CodeGen/X86/fp128-libcalls.ll | 6 +- test/CodeGen/X86/fp128-load.ll | 6 +- test/CodeGen/X86/fp128-select.ll | 12 +- test/CodeGen/X86/huge-stack-offset2.ll | 62 + test/CodeGen/X86/i256-add.ll | 122 +- test/CodeGen/X86/i386-shrink-wrapping.ll | 5 +- test/CodeGen/X86/illegal-bitfield-loadstore.ll | 141 + test/CodeGen/X86/implicit-null-check.ll | 83 +- test/CodeGen/X86/implicit-null-checks.mir | 902 +- test/CodeGen/X86/implicit-use-spill.mir | 2 +- test/CodeGen/X86/imul.ll | 230 + test/CodeGen/X86/inline-asm-A-constraint.ll | 35 + test/CodeGen/X86/inline-asm-tied.ll | 30 +- test/CodeGen/X86/insertelement-zero.ll | 180 +- test/CodeGen/X86/isel-sink.ll | 1 - test/CodeGen/X86/jump_sign.ll | 4 +- test/CodeGen/X86/known-bits-vector.ll | 151 +- test/CodeGen/X86/known-bits.ll | 170 + test/CodeGen/X86/known-signbits-vector.ll | 139 + test/CodeGen/X86/lea-opt-with-debug.mir | 122 + test/CodeGen/X86/lfence.ll | 8 - test/CodeGen/X86/licm-nested.ll | 2 +- test/CodeGen/X86/live-range-nosubreg.ll | 5 +- test/CodeGen/X86/load-combine.ll | 1314 +++ test/CodeGen/X86/load-slice.ll | 12 +- test/CodeGen/X86/local_stack_symbol_ordering.ll | 36 +- test/CodeGen/X86/logical-load-fold.ll | 4 +- test/CodeGen/X86/longlong-deadload.ll | 24 +- test/CodeGen/X86/lzcnt-zext-cmp.ll | 241 +- test/CodeGen/X86/machine-outliner-debuginfo.ll | 75 + test/CodeGen/X86/machine-outliner-tailcalls.ll | 35 + test/CodeGen/X86/machine-outliner.ll | 110 + test/CodeGen/X86/machine-region-info.mir | 83 + test/CodeGen/X86/machine-trace-metrics-crash.ll | 4 +- test/CodeGen/X86/madd.ll | 103 + test/CodeGen/X86/masked_gather_scatter.ll | 173 +- test/CodeGen/X86/masked_memop.ll | 70 +- test/CodeGen/X86/mature-mc-support.ll | 2 +- test/CodeGen/X86/memcmp.ll | 446 +- test/CodeGen/X86/mempcpy-32.ll | 20 + test/CodeGen/X86/mempcpy.ll | 10 +- test/CodeGen/X86/merge-consecutive-loads-128.ll | 50 +- test/CodeGen/X86/merge-consecutive-loads-256.ll | 14 +- test/CodeGen/X86/merge-consecutive-loads-512.ll | 90 +- .../X86/merge-store-partially-alias-loads.ll | 8 +- 
test/CodeGen/X86/merge_store.ll | 1 - test/CodeGen/X86/merge_store_duplicated_loads.ll | 88 + test/CodeGen/X86/misched-aa-colored.ll | 10 +- test/CodeGen/X86/mmx-cvt.ll | 369 + test/CodeGen/X86/mmx-fold-load.ll | 581 +- test/CodeGen/X86/mul-constant-i16.ll | 589 ++ test/CodeGen/X86/mul-constant-i32.ll | 515 ++ test/CodeGen/X86/mul-constant-i64.ll | 581 ++ test/CodeGen/X86/mul-i256.ll | 296 +- test/CodeGen/X86/mulx32.ll | 21 +- test/CodeGen/X86/mulx64.ll | 20 +- test/CodeGen/X86/neg_cmp.ll | 46 +- test/CodeGen/X86/negative-sin.ll | 2 +- test/CodeGen/X86/nontemporal-2.ll | 29 +- test/CodeGen/X86/nontemporal-loads.ll | 24 +- test/CodeGen/X86/oddshuffles.ll | 34 +- test/CodeGen/X86/overflow.ll | 64 + test/CodeGen/X86/peep-setb.ll | 79 +- test/CodeGen/X86/peep-test-4.ll | 206 +- test/CodeGen/X86/pmul.ll | 224 +- test/CodeGen/X86/pointer-vector.ll | 4 +- test/CodeGen/X86/pr11334.ll | 10 +- test/CodeGen/X86/pr12312.ll | 308 +- test/CodeGen/X86/pr14204.ll | 6 +- test/CodeGen/X86/pr14314.ll | 33 +- test/CodeGen/X86/pr16031.ll | 31 +- test/CodeGen/X86/pr17764.ll | 3 +- test/CodeGen/X86/pr18014.ll | 2 +- test/CodeGen/X86/pr18023.ll | 31 - test/CodeGen/X86/pr18344.ll | 89 + test/CodeGen/X86/pr22338.ll | 57 + test/CodeGen/X86/pr26350.ll | 18 +- test/CodeGen/X86/pr2656.ll | 29 +- test/CodeGen/X86/pr27591.ll | 14 +- test/CodeGen/X86/pr28173.ll | 27 +- test/CodeGen/X86/pr29112.ll | 5 +- test/CodeGen/X86/pr29170.ll | 23 +- test/CodeGen/X86/pr30284.ll | 6 +- test/CodeGen/X86/pr30430.ll | 56 +- test/CodeGen/X86/pr30562.ll | 22 + test/CodeGen/X86/pr30693.ll | 147 - test/CodeGen/X86/pr31773.ll | 20 + test/CodeGen/X86/pr32108.ll | 26 + test/CodeGen/X86/pr32241.ll | 86 + test/CodeGen/X86/pr32256.ll | 59 + test/CodeGen/X86/pr32278.ll | 11 + test/CodeGen/X86/pr32284.ll | 117 + test/CodeGen/X86/pr32329.ll | 126 + test/CodeGen/X86/pr32340.ll | 77 + test/CodeGen/X86/pr32345.ll | 169 + test/CodeGen/X86/pr32420.ll | 36 + test/CodeGen/X86/pr32451.ll | 69 + test/CodeGen/X86/pr32484.ll | 32 + test/CodeGen/X86/pr32588.ll | 27 + test/CodeGen/X86/pre-coalesce-2.ll | 281 + test/CodeGen/X86/pre-coalesce.ll | 51 + test/CodeGen/X86/pre-coalesce.mir | 122 + test/CodeGen/X86/prefixdata.ll | 27 +- test/CodeGen/X86/promote-vec3.ll | 23 +- test/CodeGen/X86/psubus.ll | 338 +- test/CodeGen/X86/recip-fastmath.ll | 699 +- test/CodeGen/X86/recip-fastmath2.ll | 1064 ++- test/CodeGen/X86/recip-pic.ll | 27 + test/CodeGen/X86/reduce-trunc-shl.ll | 2 +- test/CodeGen/X86/regparm.ll | 48 + test/CodeGen/X86/rot32.ll | 19 +- test/CodeGen/X86/rot64.ll | 39 +- test/CodeGen/X86/rotate.ll | 83 + test/CodeGen/X86/rtm.ll | 65 +- test/CodeGen/X86/sad.ll | 881 +- test/CodeGen/X86/sad_variations.ll | 347 + test/CodeGen/X86/safestack.ll | 5 + test/CodeGen/X86/safestack_ssp.ll | 7 + test/CodeGen/X86/scalar-int-to-fp.ll | 761 +- test/CodeGen/X86/select.ll | 530 ++ test/CodeGen/X86/select_const.ll | 244 +- test/CodeGen/X86/select_meta.ll | 2 +- test/CodeGen/X86/selectiondag-order.ll | 97 + test/CodeGen/X86/setcc-logic.ll | 482 + test/CodeGen/X86/setcc-lowering.ll | 25 + test/CodeGen/X86/setcc-sentinals.ll | 13 - test/CodeGen/X86/setcc-wide-types.ll | 140 + test/CodeGen/X86/setcc.ll | 24 +- test/CodeGen/X86/sext-i1.ll | 97 +- test/CodeGen/X86/sfence.ll | 8 - test/CodeGen/X86/sha.ll | 6 +- test/CodeGen/X86/shrink-compare.ll | 6 +- test/CodeGen/X86/shrink_vmul.ll | 4 +- test/CodeGen/X86/shuffle-combine-crash-2.ll | 20 + test/CodeGen/X86/shuffle-of-splat-multiuses.ll | 100 + test/CodeGen/X86/shuffle-vs-trunc-256.ll | 52 +- test/CodeGen/X86/shuffle-vs-trunc-512.ll 
| 58 +- test/CodeGen/X86/split-extend-vector-inreg.ll | 47 + test/CodeGen/X86/split-store.ll | 27 +- test/CodeGen/X86/sse-align-10.ll | 5 +- test/CodeGen/X86/sse-fsignum.ll | 52 +- test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 8 +- test/CodeGen/X86/sse-intrinsics-x86.ll | 42 +- test/CodeGen/X86/sse-intrinsics-x86_64.ll | 78 + test/CodeGen/X86/sse-minmax.ll | 290 +- test/CodeGen/X86/sse-regcall.ll | 54 +- test/CodeGen/X86/sse1.ll | 34 +- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll | 4 +- test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll | 8 +- test/CodeGen/X86/sse2-intrinsics-x86.ll | 82 +- test/CodeGen/X86/sse2-intrinsics-x86_64.ll | 78 + test/CodeGen/X86/sse3-avx-addsub.ll | 9 +- test/CodeGen/X86/sse3-intrinsics-x86.ll | 34 +- test/CodeGen/X86/sse41-intrinsics-fast-isel.ll | 28 +- test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll | 13 + test/CodeGen/X86/sse41-intrinsics-x86.ll | 16 +- test/CodeGen/X86/sse41.ll | 74 +- test/CodeGen/X86/sse42-intrinsics-fast-isel.ll | 22 +- test/CodeGen/X86/sse42-intrinsics-x86.ll | 63 +- test/CodeGen/X86/sse42-intrinsics-x86_64.ll | 28 + test/CodeGen/X86/sse42.ll | 58 - test/CodeGen/X86/sse42_64.ll | 21 - test/CodeGen/X86/ssse3-intrinsics-x86.ll | 29 + test/CodeGen/X86/stack-align.ll | 8 +- test/CodeGen/X86/stack-folding-adx-x86_64.ll | 18 + test/CodeGen/X86/stack-folding-bmi.ll | 121 + test/CodeGen/X86/stack-folding-bmi2.ll | 77 + test/CodeGen/X86/stack-folding-fp-avx1.ll | 22 - test/CodeGen/X86/stack-folding-fp-avx512vl.ll | 39 + test/CodeGen/X86/stack-folding-fp-sse42.ll | 4 +- test/CodeGen/X86/stack-folding-int-avx512.ll | 1572 +++- test/CodeGen/X86/stack-folding-int-avx512vl.ll | 2269 +++-- test/CodeGen/X86/stack-folding-int-sse42.ll | 2 +- test/CodeGen/X86/stack-folding-sha.ll | 72 + test/CodeGen/X86/stack-folding-tbm.ll | 201 + test/CodeGen/X86/stack-protector-remarks.ll | 103 + test/CodeGen/X86/stack-protector-target.ll | 40 +- test/CodeGen/X86/stack-protector-weight.ll | 8 +- test/CodeGen/X86/stack_guard_remat.ll | 8 +- test/CodeGen/X86/stores-merging.ll | 11 +- test/CodeGen/X86/subvector-broadcast.ll | 97 +- test/CodeGen/X86/swifterror.ll | 27 + test/CodeGen/X86/tail-call-conditional.mir | 85 + test/CodeGen/X86/tail-dup-debugloc.ll | 56 + test/CodeGen/X86/tail-dup-no-other-successor.ll | 53 + test/CodeGen/X86/tail-dup-repeat.ll | 2 +- test/CodeGen/X86/tail-merge-debugloc.ll | 42 + test/CodeGen/X86/tail-merge-identical.ll | 41 + test/CodeGen/X86/tail-merge-unreachable.ll | 2 +- test/CodeGen/X86/tail-opts.ll | 126 +- test/CodeGen/X86/twoaddr-coalesce-3.ll | 4 +- test/CodeGen/X86/unaligned-32-byte-memops.ll | 2 +- test/CodeGen/X86/unreachableblockelim.ll | 2 +- test/CodeGen/X86/unused_stackslots.ll | 8 +- test/CodeGen/X86/unwindraise.ll | 8 +- test/CodeGen/X86/update-terminator-debugloc.ll | 91 + test/CodeGen/X86/vec_cast2.ll | 22 +- test/CodeGen/X86/vec_extract-mmx.ll | 35 +- test/CodeGen/X86/vec_fp_to_int.ll | 57 +- test/CodeGen/X86/vec_fpext.ll | 4 + test/CodeGen/X86/vec_fptrunc.ll | 4 +- test/CodeGen/X86/vec_int_to_fp.ll | 97 +- test/CodeGen/X86/vec_logical.ll | 23 +- test/CodeGen/X86/vec_minmax_match.ll | 77 +- test/CodeGen/X86/vec_minmax_sint.ll | 48 +- test/CodeGen/X86/vec_minmax_uint.ll | 88 +- test/CodeGen/X86/vec_sdiv_to_shift.ll | 91 +- test/CodeGen/X86/vec_shift4.ll | 12 +- test/CodeGen/X86/vec_shift5.ll | 12 +- test/CodeGen/X86/vec_shift7.ll | 11 +- test/CodeGen/X86/vec_uint_to_fp-fastmath.ll | 8 +- test/CodeGen/X86/vec_unsafe-fp-math.ll | 2 +- test/CodeGen/X86/vec_zero_cse.ll | 68 +- test/CodeGen/X86/vector-bitreverse.ll | 103 +- 
test/CodeGen/X86/vector-blend.ll | 18 +- test/CodeGen/X86/vector-compare-all_of.ll | 946 ++ test/CodeGen/X86/vector-compare-any_of.ll | 882 ++ test/CodeGen/X86/vector-compare-results.ll | 1628 ++-- test/CodeGen/X86/vector-extend-inreg.ll | 120 + test/CodeGen/X86/vector-half-conversions.ll | 217 +- test/CodeGen/X86/vector-idiv-sdiv-128.ll | 47 +- test/CodeGen/X86/vector-idiv-sdiv-256.ll | 8 +- test/CodeGen/X86/vector-idiv-udiv-128.ll | 71 +- test/CodeGen/X86/vector-idiv-udiv-256.ll | 6 +- test/CodeGen/X86/vector-interleave.ll | 62 +- test/CodeGen/X86/vector-lzcnt-128.ll | 62 +- test/CodeGen/X86/vector-lzcnt-256.ll | 100 +- test/CodeGen/X86/vector-popcnt-256.ll | 18 +- test/CodeGen/X86/vector-rotate-128.ll | 288 +- test/CodeGen/X86/vector-rotate-256.ll | 225 +- test/CodeGen/X86/vector-sext.ll | 90 +- test/CodeGen/X86/vector-shift-ashr-128.ll | 91 +- test/CodeGen/X86/vector-shift-ashr-256.ll | 43 +- test/CodeGen/X86/vector-shift-ashr-512.ll | 5 +- test/CodeGen/X86/vector-shift-lshr-128.ll | 64 +- test/CodeGen/X86/vector-shift-lshr-256.ll | 78 +- test/CodeGen/X86/vector-shift-lshr-512.ll | 5 +- test/CodeGen/X86/vector-shift-shl-128.ll | 60 +- test/CodeGen/X86/vector-shift-shl-256.ll | 72 +- test/CodeGen/X86/vector-shift-shl-512.ll | 5 +- test/CodeGen/X86/vector-shuffle-128-v16.ll | 62 +- test/CodeGen/X86/vector-shuffle-128-v2.ll | 8 +- test/CodeGen/X86/vector-shuffle-256-v16.ll | 57 +- test/CodeGen/X86/vector-shuffle-256-v32.ll | 122 +- test/CodeGen/X86/vector-shuffle-256-v4.ll | 194 +- test/CodeGen/X86/vector-shuffle-256-v8.ll | 61 +- test/CodeGen/X86/vector-shuffle-512-v16.ll | 206 +- test/CodeGen/X86/vector-shuffle-512-v32.ll | 7 +- test/CodeGen/X86/vector-shuffle-512-v8.ll | 100 +- test/CodeGen/X86/vector-shuffle-avx512.ll | 1019 ++- test/CodeGen/X86/vector-shuffle-combining-avx2.ll | 192 +- .../X86/vector-shuffle-combining-avx512bw.ll | 74 +- .../X86/vector-shuffle-combining-avx512bwvl.ll | 2 +- .../X86/vector-shuffle-combining-avx512vbmi.ll | 2 +- test/CodeGen/X86/vector-shuffle-combining-sse41.ll | 23 + test/CodeGen/X86/vector-shuffle-combining-ssse3.ll | 108 + test/CodeGen/X86/vector-shuffle-combining-xop.ll | 37 +- test/CodeGen/X86/vector-shuffle-masked.ll | 297 +- test/CodeGen/X86/vector-shuffle-v1.ll | 72 +- test/CodeGen/X86/vector-shuffle-variable-128.ll | 1070 +-- test/CodeGen/X86/vector-shuffle-variable-256.ll | 316 +- test/CodeGen/X86/vector-sqrt.ll | 28 +- test/CodeGen/X86/vector-trunc-math.ll | 623 +- test/CodeGen/X86/vector-trunc.ll | 54 +- test/CodeGen/X86/vector-tzcnt-128.ll | 40 +- test/CodeGen/X86/vector-tzcnt-256.ll | 30 +- test/CodeGen/X86/vector-zext.ll | 332 +- test/CodeGen/X86/vectorcall.ll | 4 +- test/CodeGen/X86/viabs.ll | 146 +- test/CodeGen/X86/vselect-minmax.ll | 448 +- test/CodeGen/X86/vselect-pcmp.ll | 323 + test/CodeGen/X86/vsplit-and.ll | 8 +- test/CodeGen/X86/wide-integer-cmp.ll | 156 +- test/CodeGen/X86/widen_bitops-0.ll | 12 +- test/CodeGen/X86/widen_conv-1.ll | 4 - test/CodeGen/X86/widen_conv-3.ll | 8 +- test/CodeGen/X86/widen_conv-4.ll | 8 +- test/CodeGen/X86/widen_load-2.ll | 30 +- test/CodeGen/X86/widened-broadcast.ll | 42 +- test/CodeGen/X86/win-alloca-expander.ll | 24 +- test/CodeGen/X86/win32-eh.ll | 157 +- test/CodeGen/X86/win64_eh_leaf2.ll | 22 + test/CodeGen/X86/x32-va_start.ll | 8 +- test/CodeGen/X86/x86-64-intrcc-nosse.ll | 20 + test/CodeGen/X86/x86-64-intrcc.ll | 17 +- test/CodeGen/X86/x86-interleaved-access.ll | 34 +- test/CodeGen/X86/x86-sanitizer-shrink-wrapping.ll | 2 +- test/CodeGen/X86/xaluo.ll | 1518 +++- test/CodeGen/X86/xmulo.ll | 
788 +- test/CodeGen/X86/xop-ifma.ll | 129 + test/CodeGen/X86/xop-intrinsics-fast-isel.ll | 14 +- test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll | 39 + test/CodeGen/X86/xop-intrinsics-x86_64.ll | 32 +- test/CodeGen/X86/xop-mask-comments.ll | 8 +- test/CodeGen/X86/xor-combine-debugloc.ll | 69 + test/CodeGen/X86/xray-log-args.ll | 35 + test/CodeGen/XCore/fneg.ll | 4 +- test/CodeGen/XCore/section-name.ll | 9 + test/CodeGen/XCore/varargs.ll | 2 +- 2067 files changed, 136850 insertions(+), 40310 deletions(-) create mode 100644 test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll delete mode 100644 test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/debug-insts.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/gisel-fail-intermediate-legalizer.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/inline-asm.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/irtranslator-bitcast.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-exceptions.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-fneg.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-pow.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-shift.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/no-regclass.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/regbankselect-reg_sequence.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-binop.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-bitcast.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-br.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-cbz.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-constant.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-int-ext.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-int-ptr-casts.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-load.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-muladd.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-property.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-store.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-trunc.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select-xor.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/select.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/varargs-ios-translator.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/vastart.ll create mode 100644 test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll create mode 100644 test/CodeGen/AArch64/aarch64-fold-lslfast.ll create mode 100644 test/CodeGen/AArch64/aarch64-named-reg-w18.ll create mode 100644 test/CodeGen/AArch64/aarch64-named-reg-x18.ll create mode 100644 test/CodeGen/AArch64/and-sink.ll create mode 
100644 test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll create mode 100644 test/CodeGen/AArch64/arm64-spill-remarks.ll create mode 100644 test/CodeGen/AArch64/arm64-summary-remarks.ll create mode 100644 test/CodeGen/AArch64/br-cond-not-merge.ll create mode 100644 test/CodeGen/AArch64/dag-numsignbits.ll create mode 100644 test/CodeGen/AArch64/fast-isel-tail-call.ll create mode 100644 test/CodeGen/AArch64/ldst-opt-aa.mir create mode 100644 test/CodeGen/AArch64/live-interval-analysis.mir create mode 100644 test/CodeGen/AArch64/load-combine-big-endian.ll create mode 100644 test/CodeGen/AArch64/load-combine.ll create mode 100644 test/CodeGen/AArch64/machine-copy-remove.mir create mode 100644 test/CodeGen/AArch64/machine-outliner.ll create mode 100644 test/CodeGen/AArch64/misched-fusion-aes.ll create mode 100644 test/CodeGen/AArch64/misched-fusion-lit.ll create mode 100644 test/CodeGen/AArch64/neon-fma-FMF.ll create mode 100644 test/CodeGen/AArch64/pr27816.ll create mode 100644 test/CodeGen/AArch64/prefixdata.ll create mode 100644 test/CodeGen/AArch64/selectiondag-order.ll delete mode 100644 test/CodeGen/AArch64/tail-dup-repeat-worklist.ll create mode 100644 test/CodeGen/AArch64/tailcall-string-rvo.ll create mode 100644 test/CodeGen/AArch64/thread-pointer.ll create mode 100644 test/CodeGen/AArch64/xray-tail-call-sled.ll create mode 100644 test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir create mode 100644 test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir create mode 100644 test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir create mode 100644 test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir create mode 100644 test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll create mode 100644 test/CodeGen/AMDGPU/GlobalISel/smrd.ll create mode 100644 test/CodeGen/AMDGPU/add.v2i16.ll create mode 100644 test/CodeGen/AMDGPU/addrspacecast-captured.ll delete mode 100644 test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll delete mode 100644 test/CodeGen/AMDGPU/amdgcn.sendmsg.ll create mode 100644 test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll create mode 100644 test/CodeGen/AMDGPU/any_extend_vector_inreg.ll create mode 100644 test/CodeGen/AMDGPU/ashr.v2i16.ll create mode 100644 test/CodeGen/AMDGPU/barrier-elimination.ll create mode 100644 test/CodeGen/AMDGPU/bfe-patterns.ll create mode 100644 test/CodeGen/AMDGPU/clamp-modifier.ll create mode 100644 test/CodeGen/AMDGPU/clamp-omod-special-case.mir create mode 100644 test/CodeGen/AMDGPU/clamp.ll create mode 100644 test/CodeGen/AMDGPU/code-object-metadata-deduce-ro-arg.ll create mode 100644 test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll create mode 100644 test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll create mode 100644 test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll create mode 100644 test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll create mode 100644 test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll create mode 100644 test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll create mode 100644 test/CodeGen/AMDGPU/ds-combine-large-stride.ll create mode 100644 test/CodeGen/AMDGPU/early-if-convert-cost.ll create mode 100644 test/CodeGen/AMDGPU/early-if-convert.ll create mode 100644 test/CodeGen/AMDGPU/early-inline-alias.ll create mode 100644 test/CodeGen/AMDGPU/early-inline.ll create mode 100644 test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll create mode 100644 test/CodeGen/AMDGPU/env-amdgiz.ll create mode 100644 test/CodeGen/AMDGPU/env-amdgizcl.ll create mode 100644 
test/CodeGen/AMDGPU/extract_vector_elt-f16.ll create mode 100644 test/CodeGen/AMDGPU/fcopysign.f16.ll create mode 100644 test/CodeGen/AMDGPU/fix-vgpr-copies.mir create mode 100644 test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll create mode 100644 test/CodeGen/AMDGPU/fmuladd.v2f16.ll create mode 100644 test/CodeGen/AMDGPU/fold-cndmask.mir create mode 100644 test/CodeGen/AMDGPU/fold-immediate-output-mods.mir create mode 100644 test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll create mode 100644 test/CodeGen/AMDGPU/immv216.ll create mode 100644 test/CodeGen/AMDGPU/inlineasm-packed.ll create mode 100644 test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir create mode 100644 test/CodeGen/AMDGPU/insert-waits-callee.mir create mode 100644 test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll create mode 100644 test/CodeGen/AMDGPU/internalize.ll delete mode 100644 test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll delete mode 100644 test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll delete mode 100644 test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll create mode 100644 test/CodeGen/AMDGPU/limit-coalesce.mir delete mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.export.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.gather4.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.getlod.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.image.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.image.sample.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll delete mode 100644 test/CodeGen/AMDGPU/llvm.SI.packf16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll create mode 100644 test/CodeGen/AMDGPU/llvm.r600.cube.ll create mode 100644 test/CodeGen/AMDGPU/lower-mem-intrinsics.ll delete mode 100644 test/CodeGen/AMDGPU/lshl.ll delete mode 100644 test/CodeGen/AMDGPU/lshr.ll create mode 100644 test/CodeGen/AMDGPU/lshr.v2i16.ll delete mode 100644 test/CodeGen/AMDGPU/mulhu.ll create mode 100644 test/CodeGen/AMDGPU/multi-divergent-exit-region.ll create mode 100644 test/CodeGen/AMDGPU/nested-loop-conditions.ll create mode 100644 test/CodeGen/AMDGPU/nop-data.ll create mode 100644 test/CodeGen/AMDGPU/nullptr.ll create mode 100644 test/CodeGen/AMDGPU/omod.ll create mode 100644 test/CodeGen/AMDGPU/pack.v2f16.ll create mode 100644 test/CodeGen/AMDGPU/pack.v2i16.ll create mode 100644 test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll create mode 100644 test/CodeGen/AMDGPU/r600.alu-limits.ll create mode 100644 test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll create mode 100644 test/CodeGen/AMDGPU/r600.global_atomics.ll create mode 100644 test/CodeGen/AMDGPU/regcoalesce-dbg.mir delete mode 100644 test/CodeGen/AMDGPU/runtime-metadata.ll create mode 100644 test/CodeGen/AMDGPU/schedule-regpressure-limit.ll create 
mode 100644 test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll create mode 100644 test/CodeGen/AMDGPU/schedule-regpressure.mir create mode 100644 test/CodeGen/AMDGPU/sdwa-peephole.ll create mode 100644 test/CodeGen/AMDGPU/setcc-fneg-constant.ll create mode 100644 test/CodeGen/AMDGPU/sgprcopies.ll create mode 100644 test/CodeGen/AMDGPU/shl.v2i16.ll create mode 100644 test/CodeGen/AMDGPU/shrink-add-sub-constant.ll create mode 100644 test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll delete mode 100644 test/CodeGen/AMDGPU/si-literal-folding.ll create mode 100644 test/CodeGen/AMDGPU/sminmax.v2i16.ll create mode 100644 test/CodeGen/AMDGPU/spill-cfg-position.ll create mode 100644 test/CodeGen/AMDGPU/splitkit.mir create mode 100644 test/CodeGen/AMDGPU/sub.v2i16.ll create mode 100644 test/CodeGen/AMDGPU/subreg_interference.mir delete mode 100644 test/CodeGen/AMDGPU/urecip.ll create mode 100644 test/CodeGen/AMDGPU/vectorize-global-local.ll create mode 100644 test/CodeGen/AMDGPU/zext-lid.ll create mode 100644 test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll create mode 100644 test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir create mode 100644 test/CodeGen/ARM/arg-copy-elide.ll create mode 100644 test/CodeGen/ARM/bool-ext-inc.ll create mode 100644 test/CodeGen/ARM/cmp1-peephole-thumb.mir create mode 100644 test/CodeGen/ARM/cmp2-peephole-thumb.mir create mode 100644 test/CodeGen/ARM/fp-only-sp.ll create mode 100644 test/CodeGen/ARM/fpscr-intrinsics.ll create mode 100644 test/CodeGen/ARM/illegal-bitfield-loadstore.ll create mode 100644 test/CodeGen/ARM/ldm-stm-i256.ll create mode 100644 test/CodeGen/ARM/load-combine-big-endian.ll create mode 100644 test/CodeGen/ARM/load-combine.ll create mode 100644 test/CodeGen/ARM/lowerMUL-newload.ll create mode 100644 test/CodeGen/ARM/misched-fp-basic.ll create mode 100644 test/CodeGen/ARM/misched-int-basic-thumb2.mir create mode 100644 test/CodeGen/ARM/misched-int-basic.mir create mode 100644 test/CodeGen/ARM/no-cmov2bfi.ll create mode 100644 test/CodeGen/ARM/pr32545.ll create mode 100644 test/CodeGen/ARM/prera-ldst-aliasing.mir create mode 100644 test/CodeGen/ARM/prera-ldst-insertpt.mir create mode 100644 test/CodeGen/ARM/select_const.ll create mode 100644 test/CodeGen/ARM/setcc-logic.ll delete mode 100644 test/CodeGen/ARM/setcc-sentinals.ll create mode 100644 test/CodeGen/ARM/single-issue-r52.mir create mode 100644 test/CodeGen/ARM/sjljeh-swifterror.ll create mode 100644 test/CodeGen/ARM/thumb1-div.ll create mode 100644 test/CodeGen/ARM/unschedule-first-call.ll create mode 100644 test/CodeGen/ARM/v6-jumptable-clobber.mir create mode 100644 test/CodeGen/ARM/v8m-tail-call.ll create mode 100644 test/CodeGen/ARM/v8m.base-jumptable_alignment.ll create mode 100644 test/CodeGen/ARM/vcmp-crash.ll create mode 100644 test/CodeGen/ARM/vldm-liveness.mir create mode 100644 test/CodeGen/AVR/intrinsics/stacksave-restore.ll create mode 100644 test/CodeGen/AVR/no-print-operand-twice.ll create mode 100644 test/CodeGen/BPF/mem_offset.ll create mode 100644 test/CodeGen/BPF/warn-call.ll create mode 100644 test/CodeGen/BPF/warn-stack.ll create mode 100644 test/CodeGen/Generic/pr24662.ll create mode 100644 test/CodeGen/Hexagon/bit-bitsplit-at.ll create mode 100644 test/CodeGen/Hexagon/bit-bitsplit-src.ll create mode 100644 test/CodeGen/Hexagon/bit-bitsplit.ll create mode 100644 test/CodeGen/Hexagon/bit-ext-sat.ll create mode 100644 test/CodeGen/Hexagon/bit-extract-off.ll create mode 100644 test/CodeGen/Hexagon/bit-extract.ll create mode 100644 test/CodeGen/Hexagon/bit-has.ll create mode 100644 
test/CodeGen/Hexagon/bitmanip.ll create mode 100644 test/CodeGen/Hexagon/builtin-expect.ll create mode 100644 test/CodeGen/Hexagon/convert-to-dot-old.ll delete mode 100644 test/CodeGen/Hexagon/ctlz-cttz-ctpop.ll create mode 100644 test/CodeGen/Hexagon/early-if-merge-loop.ll create mode 100644 test/CodeGen/Hexagon/early-if-vecpred.ll create mode 100644 test/CodeGen/Hexagon/expand-condsets-dead-bad.ll create mode 100644 test/CodeGen/Hexagon/expand-condsets-dead-pred.ll create mode 100644 test/CodeGen/Hexagon/expand-vstorerw-undef2.ll create mode 100644 test/CodeGen/Hexagon/find-loop-instr.ll create mode 100644 test/CodeGen/Hexagon/ifcvt-simple-bprob.ll create mode 100644 test/CodeGen/Hexagon/inline-asm-vecpred128.ll create mode 100644 test/CodeGen/Hexagon/intrinsics/byte-store-double.ll create mode 100644 test/CodeGen/Hexagon/intrinsics/byte-store.ll create mode 100644 test/CodeGen/Hexagon/isel-exti1.ll create mode 100644 test/CodeGen/Hexagon/isel-i1arg-crash.ll create mode 100644 test/CodeGen/Hexagon/isel-op-zext-i1.ll create mode 100644 test/CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll create mode 100644 test/CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll create mode 100644 test/CodeGen/Hexagon/loop-idiom/lcssa.ll create mode 100644 test/CodeGen/Hexagon/loop-idiom/nullptr-crash.ll create mode 100644 test/CodeGen/Hexagon/loop-idiom/pmpy-infinite-loop.ll create mode 100644 test/CodeGen/Hexagon/loop-idiom/pmpy-mod.ll create mode 100644 test/CodeGen/Hexagon/loop-idiom/pmpy.ll create mode 100644 test/CodeGen/Hexagon/newvaluejump3.ll create mode 100644 test/CodeGen/Hexagon/readcyclecounter.ll create mode 100644 test/CodeGen/Hexagon/regalloc-block-overlap.ll create mode 100644 test/CodeGen/Hexagon/swp-stages4.ll create mode 100644 test/CodeGen/Hexagon/swp-stages5.ll create mode 100644 test/CodeGen/Hexagon/undo-dag-shift.ll create mode 100644 test/CodeGen/MIR/AArch64/atomic-memoperands.mir create mode 100644 test/CodeGen/MIR/AArch64/register-operand-bank.mir delete mode 100644 test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir create mode 100644 test/CodeGen/MIR/Generic/machine-function-missing-body.mir create mode 100644 test/CodeGen/MIR/X86/dynamic-regmask.ll create mode 100644 test/CodeGen/MIR/X86/register-operand-class-invalid0.mir create mode 100644 test/CodeGen/MIR/X86/register-operand-class-invalid1.mir create mode 100644 test/CodeGen/MIR/X86/register-operand-class.mir delete mode 100644 test/CodeGen/MIR/X86/used-physical-register-info.mir create mode 100644 test/CodeGen/MSP430/struct-return.ll create mode 100644 test/CodeGen/Mips/cins.ll create mode 100644 test/CodeGen/Mips/compactbranches/empty-block.mir create mode 100644 test/CodeGen/Mips/dext.ll create mode 100644 test/CodeGen/Mips/msa/bmzi_bmnzi.ll create mode 100644 test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll create mode 100644 test/CodeGen/Mips/xray-section-group.ll create mode 100644 test/CodeGen/NVPTX/f16-instructions.ll create mode 100644 test/CodeGen/NVPTX/f16x2-instructions.ll create mode 100644 test/CodeGen/NVPTX/fcos-no-fast-math.ll create mode 100644 test/CodeGen/NVPTX/fsin-no-fast-math.ll create mode 100644 test/CodeGen/NVPTX/global-variable-big.ll create mode 100644 test/CodeGen/NVPTX/idioms.ll create mode 100644 test/CodeGen/NVPTX/named-barriers.ll create mode 100644 test/CodeGen/NVPTX/param-load-store.ll delete mode 100644 test/CodeGen/NVPTX/rsqrt.ll create mode 100644 test/CodeGen/NVPTX/sqrt-approx.ll create mode 100644 test/CodeGen/PowerPC/addegluecrash.ll create mode 100644 
test/CodeGen/PowerPC/atomics-regression.ll create mode 100644 test/CodeGen/PowerPC/branch_coalesce.ll create mode 100644 test/CodeGen/PowerPC/ctrloop-i128.ll create mode 100644 test/CodeGen/PowerPC/expand-contiguous-isel.ll create mode 100644 test/CodeGen/PowerPC/expand-isel-1.mir create mode 100644 test/CodeGen/PowerPC/expand-isel-2.mir create mode 100644 test/CodeGen/PowerPC/expand-isel-3.mir create mode 100644 test/CodeGen/PowerPC/expand-isel-4.mir create mode 100644 test/CodeGen/PowerPC/expand-isel-5.mir create mode 100644 test/CodeGen/PowerPC/expand-isel-6.mir create mode 100644 test/CodeGen/PowerPC/expand-isel-7.mir create mode 100644 test/CodeGen/PowerPC/expand-isel-8.mir create mode 100644 test/CodeGen/PowerPC/expand-isel.ll create mode 100644 test/CodeGen/PowerPC/fma-aggr-FMF.ll create mode 100644 test/CodeGen/PowerPC/pr32063.ll create mode 100644 test/CodeGen/PowerPC/pr32140.ll create mode 100644 test/CodeGen/PowerPC/pristine-and-livein.mir create mode 100644 test/CodeGen/PowerPC/select_const.ll create mode 100644 test/CodeGen/PowerPC/setcc-logic.ll create mode 100644 test/CodeGen/PowerPC/sjlj_no0x.ll create mode 100644 test/CodeGen/PowerPC/stacksize.ll create mode 100644 test/CodeGen/PowerPC/subtract_from_imm.ll create mode 100644 test/CodeGen/PowerPC/tail-dup-break-cfg.ll create mode 100644 test/CodeGen/SPARC/register-clobber.ll create mode 100644 test/CodeGen/SystemZ/DAGCombine_trunc_extract.ll create mode 100644 test/CodeGen/SystemZ/DAGCombiner_illegal_BUILD_VECTOR.ll create mode 100644 test/CodeGen/SystemZ/expand-zext-pseudo.ll create mode 100644 test/CodeGen/SystemZ/extract-vector-elt-zEC12.ll create mode 100644 test/CodeGen/SystemZ/fold-memory-op-impl.ll create mode 100644 test/CodeGen/SystemZ/locr-legal-regclass.ll delete mode 100644 test/CodeGen/SystemZ/memchr-02.ll delete mode 100644 test/CodeGen/SystemZ/memcmp-02.ll create mode 100644 test/CodeGen/SystemZ/pr32372.ll create mode 100644 test/CodeGen/SystemZ/pr32505.ll create mode 100644 test/CodeGen/SystemZ/splitMove_undefReg_mverifier.ll delete mode 100644 test/CodeGen/SystemZ/strcmp-02.ll delete mode 100644 test/CodeGen/SystemZ/strlen-02.ll create mode 100644 test/CodeGen/SystemZ/undef-flag.ll create mode 100644 test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll create mode 100644 test/CodeGen/SystemZ/vec-cmpsel.ll create mode 100644 test/CodeGen/SystemZ/vec-sext.ll create mode 100644 test/CodeGen/SystemZ/vec-trunc-to-i1.ll create mode 100644 test/CodeGen/SystemZ/vec-zext.ll create mode 100644 test/CodeGen/SystemZ/vectorizer-output-3xi32.ll create mode 100644 test/CodeGen/Thumb/remove-unneeded-push-pop.ll create mode 100644 test/CodeGen/Thumb/stm-deprecated.ll create mode 100644 test/CodeGen/Thumb/tbb-reuse.mir create mode 100644 test/CodeGen/Thumb2/intrinsics-coprocessor.ll create mode 100644 test/CodeGen/Thumb2/tbb-removeadd.mir create mode 100644 test/CodeGen/WebAssembly/f16.ll create mode 100644 test/CodeGen/X86/DynamicCalleeSavedRegisters.ll create mode 100644 test/CodeGen/X86/GlobalISel/X86-regbankselect.mir create mode 100644 test/CodeGen/X86/GlobalISel/binop-isel.ll create mode 100644 test/CodeGen/X86/GlobalISel/constant.ll create mode 100644 test/CodeGen/X86/GlobalISel/frameIndex-instructionselect.mir create mode 100644 test/CodeGen/X86/GlobalISel/frameIndex.ll create mode 100644 test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll create mode 100644 test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll create mode 100644 test/CodeGen/X86/GlobalISel/legalize-add.mir create mode 100644 
test/CodeGen/X86/GlobalISel/legalize-const.mir
 create mode 100644 test/CodeGen/X86/GlobalISel/legalize-sub.mir
 create mode 100644 test/CodeGen/X86/GlobalISel/memop-isel.ll
 create mode 100644 test/CodeGen/X86/GlobalISel/select-constant.mir
 create mode 100644 test/CodeGen/X86/GlobalISel/x86_64-instructionselect.mir
 create mode 100644 test/CodeGen/X86/absolute-cmp.ll
 create mode 100644 test/CodeGen/X86/and-sink.ll
 create mode 100644 test/CodeGen/X86/arg-copy-elide.ll
 create mode 100644 test/CodeGen/X86/avx-cvt-3.ll
 create mode 100644 test/CodeGen/X86/avx512-adc-sbb.ll
 create mode 100644 test/CodeGen/X86/avx512-insert-extract_i1.ll
 create mode 100644 test/CodeGen/X86/avx512-memfold.ll
 create mode 100644 test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
 create mode 100644 test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
 create mode 100644 test/CodeGen/X86/bool-ext-inc.ll
 create mode 100644 test/CodeGen/X86/branchfolding-debugloc.ll
 create mode 100644 test/CodeGen/X86/clflushopt.ll
 create mode 100644 test/CodeGen/X86/clzero.ll
 delete mode 100644 test/CodeGen/X86/code_placement_outline_optional_branches.ll
 create mode 100644 test/CodeGen/X86/combine-abs.ll
 delete mode 100644 test/CodeGen/X86/combiner-aa-0.ll
 delete mode 100644 test/CodeGen/X86/combiner-aa-1.ll
 create mode 100644 test/CodeGen/X86/commute-3dnow.ll
 create mode 100644 test/CodeGen/X86/complex-fastmath.ll
 create mode 100644 test/CodeGen/X86/conditional-tailcall.ll
 create mode 100644 test/CodeGen/X86/div-rem-simplify.ll
 create mode 100644 test/CodeGen/X86/dropped_constructor.ll
 create mode 100644 test/CodeGen/X86/dwarf-headers.ll
 create mode 100644 test/CodeGen/X86/elf-associated.ll
 create mode 100644 test/CodeGen/X86/fast-isel-abort-warm.ll
 create mode 100644 test/CodeGen/X86/fentry-insertion.ll
 create mode 100644 test/CodeGen/X86/file-source-filename.ll
 create mode 100644 test/CodeGen/X86/fp-intrinsics.ll
 create mode 100644 test/CodeGen/X86/huge-stack-offset2.ll
 create mode 100644 test/CodeGen/X86/illegal-bitfield-loadstore.ll
 create mode 100644 test/CodeGen/X86/inline-asm-A-constraint.ll
 create mode 100644 test/CodeGen/X86/known-signbits-vector.ll
 create mode 100644 test/CodeGen/X86/lea-opt-with-debug.mir
 delete mode 100644 test/CodeGen/X86/lfence.ll
 create mode 100644 test/CodeGen/X86/load-combine.ll
 create mode 100644 test/CodeGen/X86/machine-outliner-debuginfo.ll
 create mode 100644 test/CodeGen/X86/machine-outliner-tailcalls.ll
 create mode 100644 test/CodeGen/X86/machine-outliner.ll
 create mode 100644 test/CodeGen/X86/machine-region-info.mir
 create mode 100644 test/CodeGen/X86/madd.ll
 create mode 100644 test/CodeGen/X86/mempcpy-32.ll
 create mode 100644 test/CodeGen/X86/merge_store_duplicated_loads.ll
 create mode 100644 test/CodeGen/X86/mmx-cvt.ll
 create mode 100644 test/CodeGen/X86/mul-constant-i16.ll
 create mode 100644 test/CodeGen/X86/mul-constant-i32.ll
 create mode 100644 test/CodeGen/X86/mul-constant-i64.ll
 create mode 100644 test/CodeGen/X86/overflow.ll
 delete mode 100644 test/CodeGen/X86/pr18023.ll
 create mode 100644 test/CodeGen/X86/pr18344.ll
 create mode 100644 test/CodeGen/X86/pr22338.ll
 create mode 100644 test/CodeGen/X86/pr30562.ll
 delete mode 100644 test/CodeGen/X86/pr30693.ll
 create mode 100644 test/CodeGen/X86/pr31773.ll
 create mode 100644 test/CodeGen/X86/pr32108.ll
 create mode 100644 test/CodeGen/X86/pr32241.ll
 create mode 100644 test/CodeGen/X86/pr32256.ll
 create mode 100644 test/CodeGen/X86/pr32278.ll
 create mode 100644 test/CodeGen/X86/pr32284.ll
 create mode 100644 test/CodeGen/X86/pr32329.ll
 create mode 100644 test/CodeGen/X86/pr32340.ll
 create mode 100644 test/CodeGen/X86/pr32345.ll
 create mode 100644 test/CodeGen/X86/pr32420.ll
 create mode 100644 test/CodeGen/X86/pr32451.ll
 create mode 100644 test/CodeGen/X86/pr32484.ll
 create mode 100644 test/CodeGen/X86/pr32588.ll
 create mode 100644 test/CodeGen/X86/pre-coalesce-2.ll
 create mode 100644 test/CodeGen/X86/pre-coalesce.ll
 create mode 100644 test/CodeGen/X86/pre-coalesce.mir
 create mode 100644 test/CodeGen/X86/recip-pic.ll
 create mode 100644 test/CodeGen/X86/regparm.ll
 create mode 100644 test/CodeGen/X86/sad_variations.ll
 create mode 100644 test/CodeGen/X86/selectiondag-order.ll
 create mode 100644 test/CodeGen/X86/setcc-logic.ll
 delete mode 100644 test/CodeGen/X86/setcc-sentinals.ll
 create mode 100644 test/CodeGen/X86/setcc-wide-types.ll
 delete mode 100644 test/CodeGen/X86/sfence.ll
 create mode 100644 test/CodeGen/X86/shuffle-combine-crash-2.ll
 create mode 100644 test/CodeGen/X86/shuffle-of-splat-multiuses.ll
 create mode 100644 test/CodeGen/X86/split-extend-vector-inreg.ll
 create mode 100644 test/CodeGen/X86/sse-intrinsics-x86_64.ll
 create mode 100644 test/CodeGen/X86/sse2-intrinsics-x86_64.ll
 create mode 100644 test/CodeGen/X86/sse42-intrinsics-x86_64.ll
 delete mode 100644 test/CodeGen/X86/sse42.ll
 delete mode 100644 test/CodeGen/X86/sse42_64.ll
 create mode 100644 test/CodeGen/X86/stack-folding-bmi.ll
 create mode 100644 test/CodeGen/X86/stack-folding-bmi2.ll
 create mode 100644 test/CodeGen/X86/stack-folding-sha.ll
 create mode 100644 test/CodeGen/X86/stack-folding-tbm.ll
 create mode 100644 test/CodeGen/X86/stack-protector-remarks.ll
 create mode 100644 test/CodeGen/X86/tail-call-conditional.mir
 create mode 100644 test/CodeGen/X86/tail-dup-debugloc.ll
 create mode 100644 test/CodeGen/X86/tail-dup-no-other-successor.ll
 create mode 100644 test/CodeGen/X86/tail-merge-debugloc.ll
 create mode 100644 test/CodeGen/X86/tail-merge-identical.ll
 create mode 100644 test/CodeGen/X86/update-terminator-debugloc.ll
 create mode 100644 test/CodeGen/X86/vector-compare-all_of.ll
 create mode 100644 test/CodeGen/X86/vector-compare-any_of.ll
 create mode 100644 test/CodeGen/X86/vector-extend-inreg.ll
 create mode 100644 test/CodeGen/X86/vector-shuffle-combining-sse41.ll
 create mode 100644 test/CodeGen/X86/vselect-pcmp.ll
 create mode 100644 test/CodeGen/X86/win64_eh_leaf2.ll
 create mode 100644 test/CodeGen/X86/x86-64-intrcc-nosse.ll
 create mode 100644 test/CodeGen/X86/xop-ifma.ll
 create mode 100644 test/CodeGen/X86/xor-combine-debugloc.ll
 create mode 100644 test/CodeGen/X86/xray-log-args.ll
 create mode 100644 test/CodeGen/XCore/section-name.ll
(limited to 'test/CodeGen')
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll b/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll
new file mode 100644
index 000000000000..a70cee0efcb6
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-callingconv-ios.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-apple-ios9.0"
+
+; CHECK-LABEL: name: test_varargs
+; CHECK: [[ANSWER:%[0-9]+]](s32) = G_CONSTANT i32 42
+; CHECK: [[D_ONE:%[0-9]+]](s64) = G_FCONSTANT double 1.000000e+00
+; CHECK: [[TWELVE:%[0-9]+]](s64) = G_CONSTANT i64 12
+; CHECK: [[THREE:%[0-9]+]](s8) = G_CONSTANT i8 3
+; CHECK: [[ONE:%[0-9]+]](s16) = G_CONSTANT i16 1
+; CHECK: [[FOUR:%[0-9]+]](s32) = G_CONSTANT i32 4
+; CHECK: [[F_ONE:%[0-9]+]](s32) = G_FCONSTANT float 1.000000e+00
+; CHECK:
[[TWO:%[0-9]+]](s64) = G_FCONSTANT double 2.000000e+00 + +; CHECK: %w0 = COPY [[ANSWER]] +; CHECK: %d0 = COPY [[D_ONE]] +; CHECK: %x1 = COPY [[TWELVE]] +; CHECK: G_STORE [[THREE]](s8), {{%[0-9]+}}(p0) :: (store 1 into stack, align 0) +; CHECK: G_STORE [[ONE]](s16), {{%[0-9]+}}(p0) :: (store 2 into stack + 8, align 0) +; CHECK: G_STORE [[FOUR]](s32), {{%[0-9]+}}(p0) :: (store 4 into stack + 16, align 0) +; CHECK: G_STORE [[F_ONE]](s32), {{%[0-9]+}}(p0) :: (store 4 into stack + 24, align 0) +; CHECK: G_STORE [[TWO]](s64), {{%[0-9]+}}(p0) :: (store 8 into stack + 32, align 0) +declare void @varargs(i32, double, i64, ...) +define void @test_varargs() { + call void(i32, double, i64, ...) @varargs(i32 42, double 1.0, i64 12, i8 3, i16 1, i32 4, float 1.0, double 2.0) + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll b/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll index 95b2ea2b4ffc..59b9bb49f0ee 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll @@ -56,3 +56,41 @@ define i8* @args_ptrs(i8* %x0, i16* %x1, <2 x i8>* %x2, {i8, i16, i32}* %x3, define [1 x double] @args_arr([1 x double] %d0) { ret [1 x double] %d0 } + +; CHECK-LABEL: name: test_varargs +; CHECK: [[ANSWER:%[0-9]+]](s32) = G_CONSTANT i32 42 +; CHECK: [[D_ONE:%[0-9]+]](s64) = G_FCONSTANT double 1.000000e+00 +; CHECK: [[TWELVE:%[0-9]+]](s64) = G_CONSTANT i64 12 +; CHECK: [[THREE:%[0-9]+]](s8) = G_CONSTANT i8 3 +; CHECK: [[ONE:%[0-9]+]](s16) = G_CONSTANT i16 1 +; CHECK: [[FOUR:%[0-9]+]](s32) = G_CONSTANT i32 4 +; CHECK: [[F_ONE:%[0-9]+]](s32) = G_FCONSTANT float 1.000000e+00 +; CHECK: [[TWO:%[0-9]+]](s64) = G_FCONSTANT double 2.000000e+00 + +; CHECK: %w0 = COPY [[ANSWER]] +; CHECK: %d0 = COPY [[D_ONE]] +; CHECK: %x1 = COPY [[TWELVE]] +; CHECK: %w2 = COPY [[THREE]](s8) +; CHECK: %w3 = COPY [[ONE]](s16) +; CHECK: %w4 = COPY [[FOUR]](s32) +; CHECK: %s1 = COPY [[F_ONE]](s32) +; CHECK: %d2 = COPY [[TWO]](s64) +declare void @varargs(i32, double, i64, ...) +define void @test_varargs() { + call void(i32, double, i64, ...) @varargs(i32 42, double 1.0, i64 12, i8 3, i16 1, i32 4, float 1.0, double 2.0) + ret void +} + +; signext/zeroext parameters on the stack: not part of any real ABI as far as I +; know, but ELF currently allocates 8 bytes for a signext parameter on the +; stack. The ADJCALLSTACK ops should reflect this, even if the difference is +; theoretical. 
+declare void @stack_ext_needed([8 x i64], i8 signext %in) +; CHECK-LABEL: name: test_stack_ext_needed +; CHECK: ADJCALLSTACKDOWN 8 +; CHECK: BL @stack_ext_needed +; CHECK: ADJCALLSTACKUP 8 +define void @test_stack_ext_needed() { + call void @stack_ext_needed([8 x i64] undef, i8 signext 42) + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 8d1dbc246e6a..e40199d82c9d 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -1,6 +1,6 @@ ; RUN: not llc -O0 -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR ; RUN: llc -O0 -global-isel -global-isel-abort=0 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=FALLBACK -; RUN: llc -O0 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o %t.out 2> %t.err +; RUN: llc -O0 -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o %t.out 2> %t.err ; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-OUT < %t.out ; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-ERR < %t.err ; This file checks that the fallback path to selection dag works. @@ -14,10 +14,11 @@ target triple = "aarch64--" ; We use __fixunstfti as the common denominator for __fixunstfti on Linux and ; ___fixunstfti on iOS -; ERROR: Unable to lower arguments +; ERROR: unable to lower arguments: i128 (i128)* (in function: ABIi128) ; FALLBACK: ldr q0, ; FALLBACK-NEXT: bl __fixunstfti ; +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to lower arguments: i128 (i128)* (in function: ABIi128) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for ABIi128 ; FALLBACK-WITH-REPORT-OUT-LABEL: ABIi128: ; FALLBACK-WITH-REPORT-OUT: ldr q0, @@ -31,6 +32,7 @@ define i128 @ABIi128(i128 %arg1) { ; It happens that we don't handle ConstantArray instances yet during ; translation. Any other constant would be fine too. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate constant: [1 x double] (in function: constant) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for constant ; FALLBACK-WITH-REPORT-OUT-LABEL: constant: ; FALLBACK-WITH-REPORT-OUT: fmov d0, #1.0 @@ -41,6 +43,7 @@ define [1 x double] @constant() { ; The key problem here is that we may fail to create an MBB referenced by a ; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things ; happen. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: G_STORE %vreg4, %vreg2; mem:ST4[%addr] GPR:%vreg4,%vreg2 (in function: pending_phis) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for pending_phis ; FALLBACK-WITH-REPORT-OUT-LABEL: pending_phis: define i32 @pending_phis(i1 %tst, i32 %val, i32* %addr) { @@ -60,6 +63,7 @@ false: } ; General legalizer inability to handle types whose size wasn't a power of 2. 
+; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %vreg1(s42) = G_LOAD %vreg0; mem:LD6[%addr](align=8) (in function: odd_type) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type ; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type: define void @odd_type(i42* %addr) { @@ -67,8 +71,17 @@ define void @odd_type(i42* %addr) { ret void } +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %vreg1(<7 x s32>) = G_LOAD %vreg0; mem:LD28[%addr](align=32) (in function: odd_vector) +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_vector +; FALLBACK-WITH-REPORT-OUT-LABEL: odd_vector: +define void @odd_vector(<7 x i32>* %addr) { + %vec = load <7 x i32>, <7 x i32>* %addr + ret void +} + ; RegBankSelect crashed when given invalid mappings, and AArch64's ; implementation produce valid-but-nonsense mappings for G_SEQUENCE. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to map instruction ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for sequence_mapping ; FALLBACK-WITH-REPORT-OUT-LABEL: sequence_mapping: define void @sequence_mapping([2 x i64] %in) { @@ -76,42 +89,68 @@ define void @sequence_mapping([2 x i64] %in) { } ; Legalizer was asserting when it enountered an unexpected default action. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to map instruction ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for legal_default ; FALLBACK-WITH-REPORT-LABEL: legal_default: -define void @legal_default(i64 %in) { - insertvalue [2 x i64] undef, i64 %in, 0 +define void @legal_default([8 x i8] %in) { + insertvalue { [4 x i8], [8 x i8], [4 x i8] } undef, [8 x i8] %in, 1 ret void } -; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for debug_insts -; FALLBACK-WITH-REPORT-LABEL: debug_insts: -define void @debug_insts(i32 %in) #0 !dbg !7 { -entry: - %in.addr = alloca i32, align 4 - store i32 %in, i32* %in.addr, align 4 - call void @llvm.dbg.declare(metadata i32* %in.addr, metadata !11, metadata !12), !dbg !13 - ret void, !dbg !14 + ; AArch64 was asserting instead of returning an invalid mapping for unknown + ; sizes. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate instruction: ret: ' ret i128 undef' (in function: sequence_sizes) +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for sequence_sizes +; FALLBACK-WITH-REPORT-LABEL: sequence_sizes: +define i128 @sequence_sizes([8 x i8] %in) { + ret i128 undef +} + +; Just to make sure we don't accidentally emit a normal load/store. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: %vreg2(s64) = G_LOAD %vreg0; mem:LD8[%addr] GPR:%vreg2,%vreg0 (in function: atomic_ops) +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for atomic_ops +; FALLBACK-WITH-REPORT-LABEL: atomic_ops: +define i64 @atomic_ops(i64* %addr) { + store atomic i64 0, i64* %addr unordered, align 8 + %res = load atomic i64, i64* %addr seq_cst, align 8 + ret i64 %res +} + +; Make sure we don't mess up metadata arguments. 
+declare void @llvm.write_register.i64(metadata, i64) + +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate instruction: call: ' call void @llvm.write_register.i64(metadata !0, i64 0)' (in function: test_write_register_intrin) +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_write_register_intrin +; FALLBACK-WITH-REPORT-LABEL: test_write_register_intrin: +define void @test_write_register_intrin() { + call void @llvm.write_register.i64(metadata !{!"sp"}, i64 0) + ret void } -; Function Attrs: nounwind readnone -declare void @llvm.dbg.declare(metadata, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 289075) (llvm/trunk 289080)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "tmp.c", directory: "/Users/tim/llvm/build") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"PIC Level", i32 2} -!6 = !{!"clang version 4.0.0 (trunk 289075) (llvm/trunk 289080)"} -!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) -!8 = !DISubroutineType(types: !9) -!9 = !{null, !10} -!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!11 = !DILocalVariable(name: "in", arg: 1, scope: !7, file: !1, line: 1, type: !10) -!12 = !DIExpression() -!13 = !DILocation(line: 1, column: 14, scope: !7) -!14 = !DILocation(line: 2, column: 1, scope: !7) +@_ZTIi = external global i8* +declare i32 @__gxx_personality_v0(...) + +; Check that we fallback on invoke translation failures. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate instruction: invoke: ' invoke void %callee(i128 0) +; FALLBACK-WITH-REPORT-NEXT: to label %continue unwind label %broken' (in function: invoke_weird_type) +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for invoke_weird_type +; FALLBACK-WITH-REPORT-OUT-LABEL: invoke_weird_type: +define void @invoke_weird_type(void(i128)* %callee) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + invoke void %callee(i128 0) + to label %continue unwind label %broken + +broken: + landingpad { i8*, i32 } catch i8* bitcast(i8** @_ZTIi to i8*) + ret void + +continue: + ret void +} + +; Check that we fallback on invoke translation failures. 
+; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %vreg0(s128) = G_FCONSTANT quad 2 +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_quad_dump +; FALLBACK-WITH-REPORT-OUT-LABEL: test_quad_dump: +define fp128 @test_quad_dump() { + ret fp128 0xL00000000000000004000000000000000 +} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir deleted file mode 100644 index ece5a858b49c..000000000000 --- a/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir +++ /dev/null @@ -1,2979 +0,0 @@ -# RUN: llc -O0 -mtriple=aarch64-apple-ios -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=IOS -# RUN: llc -O0 -mtriple=aarch64-linux-gnu -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-DEFAULT -# RUN: llc -O0 -mtriple=aarch64-linux-gnu -relocation-model=pic -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-PIC - -# Test the instruction selector. -# As we support more instructions, we need to split this up. - ---- | - target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - - define void @add_s8_gpr() { ret void } - define void @add_s16_gpr() { ret void } - define void @add_s32_gpr() { ret void } - define void @add_s64_gpr() { ret void } - - define void @sub_s8_gpr() { ret void } - define void @sub_s16_gpr() { ret void } - define void @sub_s32_gpr() { ret void } - define void @sub_s64_gpr() { ret void } - - define void @or_s1_gpr() { ret void } - define void @or_s16_gpr() { ret void } - define void @or_s32_gpr() { ret void } - define void @or_s64_gpr() { ret void } - define void @or_v2s32_fpr() { ret void } - - define void @xor_s8_gpr() { ret void } - define void @xor_s16_gpr() { ret void } - define void @xor_s32_gpr() { ret void } - define void @xor_s64_gpr() { ret void } - - define void @and_s8_gpr() { ret void } - define void @and_s16_gpr() { ret void } - define void @and_s32_gpr() { ret void } - define void @and_s64_gpr() { ret void } - - define void @shl_s8_gpr() { ret void } - define void @shl_s16_gpr() { ret void } - define void @shl_s32_gpr() { ret void } - define void @shl_s64_gpr() { ret void } - - define void @lshr_s32_gpr() { ret void } - define void @lshr_s64_gpr() { ret void } - - define void @ashr_s32_gpr() { ret void } - define void @ashr_s64_gpr() { ret void } - - define void @mul_s8_gpr() { ret void } - define void @mul_s16_gpr() { ret void } - define void @mul_s32_gpr() { ret void } - define void @mul_s64_gpr() { ret void } - - define void @sdiv_s32_gpr() { ret void } - define void @sdiv_s64_gpr() { ret void } - - define void @udiv_s32_gpr() { ret void } - define void @udiv_s64_gpr() { ret void } - - define void @fadd_s32_gpr() { ret void } - define void @fadd_s64_gpr() { ret void } - - define void @fsub_s32_gpr() { ret void } - define void @fsub_s64_gpr() { ret void } - - define void @fmul_s32_gpr() { ret void } - define void @fmul_s64_gpr() { ret void } - - define void @fdiv_s32_gpr() { ret void } - define void @fdiv_s64_gpr() { ret void } - - define void @sitofp_s32_s32_fpr() { ret void } - define void @sitofp_s32_s64_fpr() { ret void } - define void @sitofp_s64_s32_fpr() { ret void } - define void @sitofp_s64_s64_fpr() { ret void } - - define void @uitofp_s32_s32_fpr() { ret void } - define void @uitofp_s32_s64_fpr() { 
ret void } - define void @uitofp_s64_s32_fpr() { ret void } - define void @uitofp_s64_s64_fpr() { ret void } - - define void @fptosi_s32_s32_gpr() { ret void } - define void @fptosi_s32_s64_gpr() { ret void } - define void @fptosi_s64_s32_gpr() { ret void } - define void @fptosi_s64_s64_gpr() { ret void } - - define void @fptoui_s32_s32_gpr() { ret void } - define void @fptoui_s32_s64_gpr() { ret void } - define void @fptoui_s64_s32_gpr() { ret void } - define void @fptoui_s64_s64_gpr() { ret void } - - define void @fptrunc() { ret void } - define void @fpext() { ret void } - - define void @unconditional_br() { ret void } - define void @conditional_br() { ret void } - - define void @load_s64_gpr(i64* %addr) { ret void } - define void @load_s32_gpr(i32* %addr) { ret void } - define void @load_s16_gpr(i16* %addr) { ret void } - define void @load_s8_gpr(i8* %addr) { ret void } - define void @load_s64_fpr(i64* %addr) { ret void } - define void @load_s32_fpr(i32* %addr) { ret void } - define void @load_s16_fpr(i16* %addr) { ret void } - define void @load_s8_fpr(i8* %addr) { ret void } - - define void @store_s64_gpr(i64* %addr) { ret void } - define void @store_s32_gpr(i32* %addr) { ret void } - define void @store_s16_gpr(i16* %addr) { ret void } - define void @store_s8_gpr(i8* %addr) { ret void } - define void @store_s64_fpr(i64* %addr) { ret void } - define void @store_s32_fpr(i32* %addr) { ret void } - - define void @frame_index() { - %ptr0 = alloca i64 - ret void - } - - define void @selected_property() { ret void } - - define i32 @const_s32() { ret i32 42 } - define i64 @const_s64() { ret i64 1234567890123 } - - define i32 @fconst_s32() { ret i32 42 } - define i64 @fconst_s64() { ret i64 1234567890123 } - - define i8* @gep(i8* %in) { ret i8* undef } - - @var_local = global i8 0 - define i8* @global_local() { ret i8* undef } - - @var_got = external global i8 - define i8* @global_got() { ret i8* undef } - - define void @trunc() { ret void } - - define void @anyext_gpr() { ret void } - define void @zext_gpr() { ret void } - define void @sext_gpr() { ret void } - - define void @casts() { ret void } - - define void @bitcast_s32_gpr() { ret void } - define void @bitcast_s32_fpr() { ret void } - define void @bitcast_s32_gpr_fpr() { ret void } - define void @bitcast_s32_fpr_gpr() { ret void } - define void @bitcast_s64_gpr() { ret void } - define void @bitcast_s64_fpr() { ret void } - define void @bitcast_s64_gpr_fpr() { ret void } - define void @bitcast_s64_fpr_gpr() { ret void } - - define void @icmp() { ret void } - define void @fcmp() { ret void } - - define void @phi() { ret void } - - define void @select() { ret void } -... - ---- -# CHECK-LABEL: name: add_s8_gpr -name: add_s8_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ADDWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s8) = COPY %w0 - %1(s8) = COPY %w1 - %2(s8) = G_ADD %0, %1 -... 
- ---- -# CHECK-LABEL: name: add_s16_gpr -name: add_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ADDWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s16) = COPY %w0 - %1(s16) = COPY %w1 - %2(s16) = G_ADD %0, %1 -... - ---- -# Check that we select a 32-bit GPR G_ADD into ADDWrr on GPR32. -# Also check that we constrain the register class of the COPY to GPR32. -# CHECK-LABEL: name: add_s32_gpr -name: add_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ADDWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_ADD %0, %1 -... - ---- -# Same as add_s32_gpr, for 64-bit operations. -# CHECK-LABEL: name: add_s64_gpr -name: add_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = ADDXrr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_ADD %0, %1 -... - ---- -# CHECK-LABEL: name: sub_s8_gpr -name: sub_s8_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = SUBWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s8) = COPY %w0 - %1(s8) = COPY %w1 - %2(s8) = G_SUB %0, %1 -... - ---- -# CHECK-LABEL: name: sub_s16_gpr -name: sub_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = SUBWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s16) = COPY %w0 - %1(s16) = COPY %w1 - %2(s16) = G_SUB %0, %1 -... - ---- -# Same as add_s32_gpr, for G_SUB operations. -# CHECK-LABEL: name: sub_s32_gpr -name: sub_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = SUBWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_SUB %0, %1 -... - ---- -# Same as add_s64_gpr, for G_SUB operations. 
-# CHECK-LABEL: name: sub_s64_gpr -name: sub_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = SUBXrr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_SUB %0, %1 -... - ---- -# CHECK-LABEL: name: or_s1_gpr -name: or_s1_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ORRWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s1) = COPY %w0 - %1(s1) = COPY %w1 - %2(s1) = G_OR %0, %1 -... - ---- -# CHECK-LABEL: name: or_s16_gpr -name: or_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ORRWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s16) = COPY %w0 - %1(s16) = COPY %w1 - %2(s16) = G_OR %0, %1 -... - ---- -# Same as add_s32_gpr, for G_OR operations. -# CHECK-LABEL: name: or_s32_gpr -name: or_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ORRWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_OR %0, %1 -... - ---- -# Same as add_s64_gpr, for G_OR operations. -# CHECK-LABEL: name: or_s64_gpr -name: or_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = ORRXrr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_OR %0, %1 -... - ---- -# 64-bit G_OR on vector registers. -# CHECK-LABEL: name: or_v2s32_fpr -name: or_v2s32_fpr -legalized: true -regBankSelected: true -# -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -# CHECK-NEXT: - { id: 2, class: fpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = COPY %d1 -# The actual OR does not matter as long as it is operating -# on 64-bit width vector. -# CHECK: %2 = ORRv8i8 %0, %1 -body: | - bb.0: - liveins: %d0, %d1 - - %0(<2 x s32>) = COPY %d0 - %1(<2 x s32>) = COPY %d1 - %2(<2 x s32>) = G_OR %0, %1 -... 
- ---- -# CHECK-LABEL: name: xor_s8_gpr -name: xor_s8_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = EORWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s8) = COPY %w0 - %1(s8) = COPY %w1 - %2(s8) = G_XOR %0, %1 -... - ---- -# CHECK-LABEL: name: xor_s16_gpr -name: xor_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = EORWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s16) = COPY %w0 - %1(s16) = COPY %w1 - %2(s16) = G_XOR %0, %1 -... - ---- -# Same as add_s32_gpr, for G_XOR operations. -# CHECK-LABEL: name: xor_s32_gpr -name: xor_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = EORWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_XOR %0, %1 -... - ---- -# Same as add_s64_gpr, for G_XOR operations. -# CHECK-LABEL: name: xor_s64_gpr -name: xor_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = EORXrr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_XOR %0, %1 -... - ---- -# CHECK-LABEL: name: and_s8_gpr -name: and_s8_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ANDWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s8) = COPY %w0 - %1(s8) = COPY %w1 - %2(s8) = G_AND %0, %1 -... - ---- -# CHECK-LABEL: name: and_s16_gpr -name: and_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ANDWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s16) = COPY %w0 - %1(s16) = COPY %w1 - %2(s16) = G_AND %0, %1 -... - ---- -# Same as add_s32_gpr, for G_AND operations. 
-# CHECK-LABEL: name: and_s32_gpr -name: and_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ANDWrr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_AND %0, %1 -... - ---- -# Same as add_s64_gpr, for G_AND operations. -# CHECK-LABEL: name: and_s64_gpr -name: and_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = ANDXrr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_AND %0, %1 -... - ---- -# CHECK-LABEL: name: shl_s8_gpr -name: shl_s8_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = LSLVWr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s8) = COPY %w0 - %1(s8) = COPY %w1 - %2(s8) = G_SHL %0, %1 -... - ---- -# CHECK-LABEL: name: shl_s16_gpr -name: shl_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = LSLVWr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s16) = COPY %w0 - %1(s16) = COPY %w1 - %2(s16) = G_SHL %0, %1 -... - ---- -# Same as add_s32_gpr, for G_SHL operations. -# CHECK-LABEL: name: shl_s32_gpr -name: shl_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = LSLVWr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_SHL %0, %1 -... - ---- -# Same as add_s64_gpr, for G_SHL operations. -# CHECK-LABEL: name: shl_s64_gpr -name: shl_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = LSLVXr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_SHL %0, %1 -... - ---- -# Same as add_s32_gpr, for G_LSHR operations. 
-# CHECK-LABEL: name: lshr_s32_gpr -name: lshr_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = LSRVWr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_LSHR %0, %1 -... - ---- -# Same as add_s64_gpr, for G_LSHR operations. -# CHECK-LABEL: name: lshr_s64_gpr -name: lshr_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = LSRVXr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_LSHR %0, %1 -... - ---- -# Same as add_s32_gpr, for G_ASHR operations. -# CHECK-LABEL: name: ashr_s32_gpr -name: ashr_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = ASRVWr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_ASHR %0, %1 -... - ---- -# Same as add_s64_gpr, for G_ASHR operations. -# CHECK-LABEL: name: ashr_s64_gpr -name: ashr_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = ASRVXr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_ASHR %0, %1 -... - ---- -# CHECK-LABEL: name: mul_s8_gpr -name: mul_s8_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = MADDWrrr %0, %1, %wzr -body: | - bb.0: - liveins: %w0, %w1 - - %0(s8) = COPY %w0 - %1(s8) = COPY %w1 - %2(s8) = G_MUL %0, %1 -... - ---- -# CHECK-LABEL: name: mul_s16_gpr -name: mul_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = MADDWrrr %0, %1, %wzr -body: | - bb.0: - liveins: %w0, %w1 - - %0(s16) = COPY %w0 - %1(s16) = COPY %w1 - %2(s16) = G_MUL %0, %1 -... - ---- -# Check that we select s32 GPR G_MUL. 
This is trickier than other binops because -# there is only MADDWrrr, and we have to use the WZR physreg. -# CHECK-LABEL: name: mul_s32_gpr -name: mul_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = MADDWrrr %0, %1, %wzr -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_MUL %0, %1 -... - ---- -# Same as mul_s32_gpr for the s64 type. -# CHECK-LABEL: name: mul_s64_gpr -name: mul_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = MADDXrrr %0, %1, %xzr -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_MUL %0, %1 -... - ---- -# Same as add_s32_gpr, for G_SDIV operations. -# CHECK-LABEL: name: sdiv_s32_gpr -name: sdiv_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = SDIVWr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_SDIV %0, %1 -... - ---- -# Same as add_s64_gpr, for G_SDIV operations. -# CHECK-LABEL: name: sdiv_s64_gpr -name: sdiv_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = SDIVXr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_SDIV %0, %1 -... - ---- -# Same as add_s32_gpr, for G_UDIV operations. -# CHECK-LABEL: name: udiv_s32_gpr -name: udiv_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %w1 -# CHECK: %2 = UDIVWr %0, %1 -body: | - bb.0: - liveins: %w0, %w1 - - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s32) = G_UDIV %0, %1 -... - ---- -# Same as add_s64_gpr, for G_UDIV operations. 
-# CHECK-LABEL: name: udiv_s64_gpr -name: udiv_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: %2 = UDIVXr %0, %1 -body: | - bb.0: - liveins: %x0, %x1 - - %0(s64) = COPY %x0 - %1(s64) = COPY %x1 - %2(s64) = G_UDIV %0, %1 -... - ---- -# Check that we select a s32 FPR G_FADD into FADDSrr. -# CHECK-LABEL: name: fadd_s32_gpr -name: fadd_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -# CHECK-NEXT: - { id: 2, class: fpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = COPY %s1 -# CHECK: %2 = FADDSrr %0, %1 -body: | - bb.0: - liveins: %s0, %s1 - - %0(s32) = COPY %s0 - %1(s32) = COPY %s1 - %2(s32) = G_FADD %0, %1 -... - ---- -# CHECK-LABEL: name: fadd_s64_gpr -name: fadd_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -# CHECK-NEXT: - { id: 2, class: fpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = COPY %d1 -# CHECK: %2 = FADDDrr %0, %1 -body: | - bb.0: - liveins: %d0, %d1 - - %0(s64) = COPY %d0 - %1(s64) = COPY %d1 - %2(s64) = G_FADD %0, %1 -... - ---- -# CHECK-LABEL: name: fsub_s32_gpr -name: fsub_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -# CHECK-NEXT: - { id: 2, class: fpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = COPY %s1 -# CHECK: %2 = FSUBSrr %0, %1 -body: | - bb.0: - liveins: %s0, %s1 - - %0(s32) = COPY %s0 - %1(s32) = COPY %s1 - %2(s32) = G_FSUB %0, %1 -... - ---- -# CHECK-LABEL: name: fsub_s64_gpr -name: fsub_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -# CHECK-NEXT: - { id: 2, class: fpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = COPY %d1 -# CHECK: %2 = FSUBDrr %0, %1 -body: | - bb.0: - liveins: %d0, %d1 - - %0(s64) = COPY %d0 - %1(s64) = COPY %d1 - %2(s64) = G_FSUB %0, %1 -... - ---- -# CHECK-LABEL: name: fmul_s32_gpr -name: fmul_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -# CHECK-NEXT: - { id: 2, class: fpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = COPY %s1 -# CHECK: %2 = FMULSrr %0, %1 -body: | - bb.0: - liveins: %s0, %s1 - - %0(s32) = COPY %s0 - %1(s32) = COPY %s1 - %2(s32) = G_FMUL %0, %1 -... 
- ---- -# CHECK-LABEL: name: fmul_s64_gpr -name: fmul_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -# CHECK-NEXT: - { id: 2, class: fpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = COPY %d1 -# CHECK: %2 = FMULDrr %0, %1 -body: | - bb.0: - liveins: %d0, %d1 - - %0(s64) = COPY %d0 - %1(s64) = COPY %d1 - %2(s64) = G_FMUL %0, %1 -... - ---- -# CHECK-LABEL: name: fdiv_s32_gpr -name: fdiv_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -# CHECK-NEXT: - { id: 2, class: fpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = COPY %s1 -# CHECK: %2 = FDIVSrr %0, %1 -body: | - bb.0: - liveins: %s0, %s1 - - %0(s32) = COPY %s0 - %1(s32) = COPY %s1 - %2(s32) = G_FDIV %0, %1 -... - ---- -# CHECK-LABEL: name: fdiv_s64_gpr -name: fdiv_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -# CHECK-NEXT: - { id: 2, class: fpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = COPY %d1 -# CHECK: %2 = FDIVDrr %0, %1 -body: | - bb.0: - liveins: %d0, %d1 - - %0(s64) = COPY %d0 - %1(s64) = COPY %d1 - %2(s64) = G_FDIV %0, %1 -... - ---- -# CHECK-LABEL: name: sitofp_s32_s32_fpr -name: sitofp_s32_s32_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = SCVTFUWSri %0 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s32) = G_SITOFP %0 -... - ---- -# CHECK-LABEL: name: sitofp_s32_s64_fpr -name: sitofp_s32_s64_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = SCVTFUXSri %0 -body: | - bb.0: - liveins: %x0 - - %0(s64) = COPY %x0 - %1(s32) = G_SITOFP %0 -... - ---- -# CHECK-LABEL: name: sitofp_s64_s32_fpr -name: sitofp_s64_s32_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = SCVTFUWDri %0 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s64) = G_SITOFP %0 -... - ---- -# CHECK-LABEL: name: sitofp_s64_s64_fpr -name: sitofp_s64_s64_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = SCVTFUXDri %0 -body: | - bb.0: - liveins: %x0 - - %0(s64) = COPY %x0 - %1(s64) = G_SITOFP %0 -... 
- ---- -# CHECK-LABEL: name: uitofp_s32_s32_fpr -name: uitofp_s32_s32_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = UCVTFUWSri %0 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s32) = G_UITOFP %0 -... - ---- -# CHECK-LABEL: name: uitofp_s32_s64_fpr -name: uitofp_s32_s64_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = UCVTFUXSri %0 -body: | - bb.0: - liveins: %x0 - - %0(s64) = COPY %x0 - %1(s32) = G_UITOFP %0 -... - ---- -# CHECK-LABEL: name: uitofp_s64_s32_fpr -name: uitofp_s64_s32_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = UCVTFUWDri %0 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s64) = G_UITOFP %0 -... - ---- -# CHECK-LABEL: name: uitofp_s64_s64_fpr -name: uitofp_s64_s64_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = UCVTFUXDri %0 -body: | - bb.0: - liveins: %x0 - - %0(s64) = COPY %x0 - %1(s64) = G_UITOFP %0 -... - ---- -# CHECK-LABEL: name: fptosi_s32_s32_gpr -name: fptosi_s32_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = FCVTZSUWSr %0 -body: | - bb.0: - liveins: %s0 - - %0(s32) = COPY %s0 - %1(s32) = G_FPTOSI %0 -... - ---- -# CHECK-LABEL: name: fptosi_s32_s64_gpr -name: fptosi_s32_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = FCVTZSUWDr %0 -body: | - bb.0: - liveins: %d0 - - %0(s64) = COPY %d0 - %1(s32) = G_FPTOSI %0 -... - ---- -# CHECK-LABEL: name: fptosi_s64_s32_gpr -name: fptosi_s64_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = FCVTZSUXSr %0 -body: | - bb.0: - liveins: %s0 - - %0(s32) = COPY %s0 - %1(s64) = G_FPTOSI %0 -... - ---- -# CHECK-LABEL: name: fptosi_s64_s64_gpr -name: fptosi_s64_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = FCVTZSUXDr %0 -body: | - bb.0: - liveins: %d0 - - %0(s64) = COPY %d0 - %1(s64) = G_FPTOSI %0 -... 
- ---- -# CHECK-LABEL: name: fptoui_s32_s32_gpr -name: fptoui_s32_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = FCVTZUUWSr %0 -body: | - bb.0: - liveins: %s0 - - %0(s32) = COPY %s0 - %1(s32) = G_FPTOUI %0 -... - ---- -# CHECK-LABEL: name: fptoui_s32_s64_gpr -name: fptoui_s32_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = FCVTZUUWDr %0 -body: | - bb.0: - liveins: %d0 - - %0(s64) = COPY %d0 - %1(s32) = G_FPTOUI %0 -... - ---- -# CHECK-LABEL: name: fptoui_s64_s32_gpr -name: fptoui_s64_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = FCVTZUUXSr %0 -body: | - bb.0: - liveins: %s0 - - %0(s32) = COPY %s0 - %1(s64) = G_FPTOUI %0 -... - ---- -# CHECK-LABEL: name: fptoui_s64_s64_gpr -name: fptoui_s64_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = FCVTZUUXDr %0 -body: | - bb.0: - liveins: %d0 - - %0(s64) = COPY %d0 - %1(s64) = G_FPTOUI %0 -... - ---- -# CHECK-LABEL: name: fptrunc -name: fptrunc -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK: - { id: 0, class: fpr64 } -# CHECK: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = FCVTSDr %0 -body: | - bb.0: - liveins: %d0 - - %0(s64) = COPY %d0 - %1(s32) = G_FPTRUNC %0 -... - ---- -# CHECK-LABEL: name: fpext -name: fpext -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK: - { id: 0, class: fpr32 } -# CHECK: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = FCVTDSr %0 -body: | - bb.0: - liveins: %d0 - - %0(s32) = COPY %s0 - %1(s64) = G_FPEXT %0 -... - ---- -# CHECK-LABEL: name: unconditional_br -name: unconditional_br -legalized: true -regBankSelected: true - -# CHECK: body: -# CHECK: bb.0: -# CHECK: successors: %bb.0 -# CHECK: B %bb.0 -body: | - bb.0: - successors: %bb.0 - - G_BR %bb.0 -... - ---- -# CHECK-LABEL: name: conditional_br -name: conditional_br -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - -# CHECK: body: -# CHECK: bb.0: -# CHECK: TBNZW %0, 0, %bb.1 -# CHECK: B %bb.0 -body: | - bb.0: - successors: %bb.0, %bb.1 - %0(s1) = COPY %w0 - G_BRCOND %0(s1), %bb.1 - G_BR %bb.0 - - bb.1: -... 
- ---- -# CHECK-LABEL: name: load_s64_gpr -name: load_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = LDRXui %0, 0 :: (load 8 from %ir.addr) -body: | - bb.0: - liveins: %x0 - - %0(p0) = COPY %x0 - %1(s64) = G_LOAD %0 :: (load 8 from %ir.addr) - -... - ---- -# CHECK-LABEL: name: load_s32_gpr -name: load_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = LDRWui %0, 0 :: (load 4 from %ir.addr) -body: | - bb.0: - liveins: %x0 - - %0(p0) = COPY %x0 - %1(s32) = G_LOAD %0 :: (load 4 from %ir.addr) - -... - ---- -# CHECK-LABEL: name: load_s16_gpr -name: load_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = LDRHHui %0, 0 :: (load 2 from %ir.addr) -body: | - bb.0: - liveins: %x0 - - %0(p0) = COPY %x0 - %1(s16) = G_LOAD %0 :: (load 2 from %ir.addr) - -... - ---- -# CHECK-LABEL: name: load_s8_gpr -name: load_s8_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = LDRBBui %0, 0 :: (load 1 from %ir.addr) -body: | - bb.0: - liveins: %x0 - - %0(p0) = COPY %x0 - %1(s8) = G_LOAD %0 :: (load 1 from %ir.addr) - -... - ---- -# CHECK-LABEL: name: load_s64_fpr -name: load_s64_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = LDRDui %0, 0 :: (load 8 from %ir.addr) -body: | - bb.0: - liveins: %x0 - - %0(p0) = COPY %x0 - %1(s64) = G_LOAD %0 :: (load 8 from %ir.addr) - -... - ---- -# CHECK-LABEL: name: load_s32_fpr -name: load_s32_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = LDRSui %0, 0 :: (load 4 from %ir.addr) -body: | - bb.0: - liveins: %x0 - - %0(p0) = COPY %x0 - %1(s32) = G_LOAD %0 :: (load 4 from %ir.addr) - -... - ---- -# CHECK-LABEL: name: load_s16_fpr -name: load_s16_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: fpr16 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = LDRHui %0, 0 :: (load 2 from %ir.addr) -body: | - bb.0: - liveins: %x0 - - %0(p0) = COPY %x0 - %1(s16) = G_LOAD %0 :: (load 2 from %ir.addr) - -... 
- ---- -# CHECK-LABEL: name: load_s8_fpr -name: load_s8_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: fpr8 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = LDRBui %0, 0 :: (load 1 from %ir.addr) -body: | - bb.0: - liveins: %x0 - - %0(p0) = COPY %x0 - %1(s8) = G_LOAD %0 :: (load 1 from %ir.addr) - -... - ---- -# CHECK-LABEL: name: store_s64_gpr -name: store_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %x1 -# CHECK: STRXui %1, %0, 0 :: (store 8 into %ir.addr) -body: | - bb.0: - liveins: %x0, %x1 - - %0(p0) = COPY %x0 - %1(s64) = COPY %x1 - G_STORE %1, %0 :: (store 8 into %ir.addr) - -... - ---- -# CHECK-LABEL: name: store_s32_gpr -name: store_s32_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %w1 -# CHECK: STRWui %1, %0, 0 :: (store 4 into %ir.addr) -body: | - bb.0: - liveins: %x0, %w1 - - %0(p0) = COPY %x0 - %1(s32) = COPY %w1 - G_STORE %1, %0 :: (store 4 into %ir.addr) - -... - ---- -# CHECK-LABEL: name: store_s16_gpr -name: store_s16_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %w1 -# CHECK: STRHHui %1, %0, 0 :: (store 2 into %ir.addr) -body: | - bb.0: - liveins: %x0, %w1 - - %0(p0) = COPY %x0 - %1(s16) = COPY %w1 - G_STORE %1, %0 :: (store 2 into %ir.addr) - -... - ---- -# CHECK-LABEL: name: store_s8_gpr -name: store_s8_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %w1 -# CHECK: STRBBui %1, %0, 0 :: (store 1 into %ir.addr) -body: | - bb.0: - liveins: %x0, %w1 - - %0(p0) = COPY %x0 - %1(s8) = COPY %w1 - G_STORE %1, %0 :: (store 1 into %ir.addr) - -... - ---- -# CHECK-LABEL: name: store_s64_fpr -name: store_s64_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %d1 -# CHECK: STRDui %1, %0, 0 :: (store 8 into %ir.addr) -body: | - bb.0: - liveins: %x0, %d1 - - %0(p0) = COPY %x0 - %1(s64) = COPY %d1 - G_STORE %1, %0 :: (store 8 into %ir.addr) - -... 
- ---- -# CHECK-LABEL: name: store_s32_fpr -name: store_s32_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -# CHECK-NEXT: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %s1 -# CHECK: STRSui %1, %0, 0 :: (store 4 into %ir.addr) -body: | - bb.0: - liveins: %x0, %s1 - - %0(p0) = COPY %x0 - %1(s32) = COPY %s1 - G_STORE %1, %0 :: (store 4 into %ir.addr) - -... - ---- -# CHECK-LABEL: name: frame_index -name: frame_index -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp } -registers: - - { id: 0, class: gpr } - -stack: - - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 } - -# CHECK: body: -# CHECK: %0 = ADDXri %stack.0.ptr0, 0, 0 -body: | - bb.0: - %0(p0) = G_FRAME_INDEX %stack.0.ptr0 -... - ---- -# Check that we set the "selected" property. -# CHECK-LABEL: name: selected_property -# CHECK: legalized: true -# CHECK-NEXT: regBankSelected: true -# CHECK-NEXT: selected: true -name: selected_property -legalized: true -regBankSelected: true -selected: false -body: | - bb.0: -... - ---- -# CHECK-LABEL: name: const_s32 -name: const_s32 -legalized: true -regBankSelected: true -registers: - - { id: 0, class: gpr } - -# CHECK: body: -# CHECK: %0 = MOVi32imm 42 -body: | - bb.0: - %0(s32) = G_CONSTANT i32 42 -... - ---- -# CHECK-LABEL: name: const_s64 -name: const_s64 -legalized: true -regBankSelected: true -registers: - - { id: 0, class: gpr } - -# CHECK: body: -# CHECK: %0 = MOVi64imm 1234567890123 -body: | - bb.0: - %0(s64) = G_CONSTANT i64 1234567890123 -... - ---- -# CHECK-LABEL: name: fconst_s32 -name: fconst_s32 -legalized: true -regBankSelected: true -registers: - - { id: 0, class: fpr } - -# CHECK: body: -# CHECK: [[TMP:%[0-9]+]] = MOVi32imm 1080033280 -# CHECK: %0 = COPY [[TMP]] -body: | - bb.0: - %0(s32) = G_FCONSTANT float 3.5 -... - ---- -# CHECK-LABEL: name: fconst_s64 -name: fconst_s64 -legalized: true -regBankSelected: true -registers: - - { id: 0, class: fpr } - -# CHECK: body: -# CHECK: [[TMP:%[0-9]+]] = MOVi64imm 4607182418800017408 -# CHECK: %0 = COPY [[TMP]] -body: | - bb.0: - %0(s64) = G_FCONSTANT double 1.0 -... - ---- -# CHECK-LABEL: name: gep -name: gep -legalized: true -regBankSelected: true -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - -# CHECK: body: -# CHECK: %1 = MOVi64imm 42 -# CHECK: %2 = ADDXrr %0, %1 -body: | - bb.0: - liveins: %x0 - %0(p0) = COPY %x0 - %1(s64) = G_CONSTANT i64 42 - %2(p0) = G_GEP %0, %1(s64) -... - ---- -# Global defined in the same linkage unit so no GOT is needed -# CHECK-LABEL: name: global_local -name: global_local -legalized: true -regBankSelected: true -registers: - - { id: 0, class: gpr } - -# CHECK: body: -# IOS: %0 = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local -# LINUX-DEFAULT: %0 = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local -# LINUX-PIC: %0 = LOADgot target-flags(aarch64-got) @var_local -body: | - bb.0: - %0(p0) = G_GLOBAL_VALUE @var_local -... 
- ---- -# CHECK-LABEL: name: global_got -name: global_got -legalized: true -regBankSelected: true -registers: - - { id: 0, class: gpr } - -# CHECK: body: -# IOS: %0 = LOADgot target-flags(aarch64-got) @var_got -# LINUX-DEFAULT: %0 = MOVaddr target-flags(aarch64-page) @var_got, target-flags(aarch64-pageoff, aarch64-nc) @var_got -# LINUX-PIC: %0 = LOADgot target-flags(aarch64-got) @var_got -body: | - bb.0: - %0(p0) = G_GLOBAL_VALUE @var_got -... - ---- -# CHECK-LABEL: name: trunc -name: trunc -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -# CHECK-NEXT: - { id: 3, class: gpr32 } -# CHECK-NEXT: - { id: 4, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - -# CHECK: body: -# CHECK: %1 = COPY %0 -# CHECK: %3 = COPY %2.sub_32 -# CHECK: %4 = COPY %2.sub_32 -body: | - bb.0: - liveins: %w0, %x0 - - %0(s32) = COPY %w0 - %1(s1) = G_TRUNC %0 - - %2(s64) = COPY %x0 - %3(s32) = G_TRUNC %2 - %4(s8) = G_TRUNC %2 -... - ---- -# CHECK-LABEL: name: anyext_gpr -name: anyext_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32all } -# CHECK-NEXT: - { id: 1, class: gpr64all } -# CHECK-NEXT: - { id: 2, class: gpr32all } -# CHECK-NEXT: - { id: 3, class: gpr32all } -# CHECK-NEXT: - { id: 4, class: gpr64all } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %4 = SUBREG_TO_REG 0, %0, 15 -# CHECK: %1 = COPY %4 -# CHECK: %2 = COPY %w0 -# CHECK: %3 = COPY %2 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s64) = G_ANYEXT %0 - %2(s8) = COPY %w0 - %3(s32) = G_ANYEXT %2 -... - ---- -# CHECK-LABEL: name: zext_gpr -name: zext_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -# CHECK-NEXT: - { id: 3, class: gpr32 } -# CHECK-NEXT: - { id: 4, class: gpr32 } -# CHECK-NEXT: - { id: 5, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %5 = SUBREG_TO_REG 0, %0, 15 -# CHECK: %1 = UBFMXri %5, 0, 31 -# CHECK: %2 = COPY %w0 -# CHECK: %3 = UBFMWri %2, 0, 7 -# CHECK: %4 = UBFMWri %2, 0, 7 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s64) = G_ZEXT %0 - %2(s8) = COPY %w0 - %3(s32) = G_ZEXT %2 - %4(s16)= G_ZEXT %2 -... 
- ---- -# CHECK-LABEL: name: sext_gpr -name: sext_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr64 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -# CHECK-NEXT: - { id: 3, class: gpr32 } -# CHECK-NEXT: - { id: 4, class: gpr32 } -# CHECK-NEXT: - { id: 5, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %5 = SUBREG_TO_REG 0, %0, 15 -# CHECK: %1 = SBFMXri %5, 0, 31 -# CHECK: %2 = COPY %w0 -# CHECK: %3 = SBFMWri %2, 0, 7 -# CHECK: %4 = SBFMWri %2, 0, 7 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s64) = G_SEXT %0 - %2(s8) = COPY %w0 - %3(s32) = G_SEXT %2 - %4(s16) = G_SEXT %2 -... - ---- -# CHECK-LABEL: name: casts -name: casts -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64all } -# CHECK-NEXT: - { id: 1, class: fpr64 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -# CHECK-NEXT: - { id: 3, class: gpr64 } -# CHECK-NEXT: - { id: 4, class: gpr32 } -# CHECK-NEXT: - { id: 5, class: gpr32 } -# CHECK-NEXT: - { id: 6, class: gpr32 } -# CHECK-NEXT: - { id: 7, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - - { id: 5, class: gpr } - - { id: 6, class: gpr } - - { id: 7, class: gpr } -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %0 -# CHECK: %2 = COPY %0 -# CHECK: %3 = COPY %2 -# CHECK: %4 = COPY %2.sub_32 -# CHECK: %5 = COPY %2.sub_32 -# CHECK: %6 = COPY %2.sub_32 -# CHECK: %7 = COPY %2.sub_32 -body: | - bb.0: - liveins: %x0 - %0(s64) = COPY %x0 - %1(<8 x s8>) = G_BITCAST %0(s64) - %2(p0) = G_INTTOPTR %0 - - %3(s64) = G_PTRTOINT %2 - %4(s32) = G_PTRTOINT %2 - %5(s16) = G_PTRTOINT %2 - %6(s8) = G_PTRTOINT %2 - %7(s1) = G_PTRTOINT %2 -... - ---- -# CHECK-LABEL: name: bitcast_s32_gpr -name: bitcast_s32_gpr -legalized: true -regBankSelected: true -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32all } -# CHECK-NEXT: - { id: 1, class: gpr32all } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %0 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s32) = G_BITCAST %0 -... - ---- -# CHECK-LABEL: name: bitcast_s32_fpr -name: bitcast_s32_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = COPY %0 -body: | - bb.0: - liveins: %s0 - - %0(s32) = COPY %s0 - %1(s32) = G_BITCAST %0 -... - ---- -# CHECK-LABEL: name: bitcast_s32_gpr_fpr -name: bitcast_s32_gpr_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32all } -# CHECK-NEXT: - { id: 1, class: fpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %w0 -# CHECK: %1 = COPY %0 -body: | - bb.0: - liveins: %w0 - - %0(s32) = COPY %w0 - %1(s32) = G_BITCAST %0 -... 
- ---- -# CHECK-LABEL: name: bitcast_s32_fpr_gpr -name: bitcast_s32_fpr_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32all } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %s0 -# CHECK: %1 = COPY %0 -body: | - bb.0: - liveins: %s0 - - %0(s32) = COPY %s0 - %1(s32) = G_BITCAST %0 -... - ---- -# CHECK-LABEL: name: bitcast_s64_gpr -name: bitcast_s64_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64all } -# CHECK-NEXT: - { id: 1, class: gpr64all } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %0 -body: | - bb.0: - liveins: %x0 - - %0(s64) = COPY %x0 - %1(s64) = G_BITCAST %0 -... - ---- -# CHECK-LABEL: name: bitcast_s64_fpr -name: bitcast_s64_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: fpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = COPY %0 -body: | - bb.0: - liveins: %d0 - - %0(s64) = COPY %d0 - %1(s64) = G_BITCAST %0 -... - ---- -# CHECK-LABEL: name: bitcast_s64_gpr_fpr -name: bitcast_s64_gpr_fpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64all } -# CHECK-NEXT: - { id: 1, class: fpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: fpr } -# CHECK: body: -# CHECK: %0 = COPY %x0 -# CHECK: %1 = COPY %0 -body: | - bb.0: - liveins: %x0 - - %0(s64) = COPY %x0 - %1(s64) = G_BITCAST %0 -... - ---- -# CHECK-LABEL: name: bitcast_s64_fpr_gpr -name: bitcast_s64_fpr_gpr -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr64 } -# CHECK-NEXT: - { id: 1, class: gpr64all } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - -# CHECK: body: -# CHECK: %0 = COPY %d0 -# CHECK: %1 = COPY %0 -body: | - bb.0: - liveins: %d0 - - %0(s64) = COPY %d0 - %1(s64) = G_BITCAST %0 -... - ---- -# CHECK-LABEL: name: icmp -name: icmp -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr64 } -# CHECK-NEXT: - { id: 3, class: gpr32 } -# CHECK-NEXT: - { id: 4, class: gpr64 } -# CHECK-NEXT: - { id: 5, class: gpr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - - { id: 5, class: gpr } - -# CHECK: body: -# CHECK: %wzr = SUBSWrr %0, %0, implicit-def %nzcv -# CHECK: %1 = CSINCWr %wzr, %wzr, 1, implicit %nzcv - -# CHECK: %xzr = SUBSXrr %2, %2, implicit-def %nzcv -# CHECK: %3 = CSINCWr %wzr, %wzr, 3, implicit %nzcv - -# CHECK: %xzr = SUBSXrr %4, %4, implicit-def %nzcv -# CHECK: %5 = CSINCWr %wzr, %wzr, 0, implicit %nzcv - -body: | - bb.0: - liveins: %w0, %x0 - - %0(s32) = COPY %w0 - %1(s1) = G_ICMP intpred(eq), %0, %0 - - %2(s64) = COPY %x0 - %3(s1) = G_ICMP intpred(uge), %2, %2 - - %4(p0) = COPY %x0 - %5(s1) = G_ICMP intpred(ne), %4, %4 -... 
- ---- -# CHECK-LABEL: name: fcmp -name: fcmp -legalized: true -regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: fpr64 } -# CHECK-NEXT: - { id: 3, class: gpr32 } -# CHECK-NEXT: - { id: 4, class: gpr32 } -# CHECK-NEXT: - { id: 5, class: gpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - - { id: 2, class: fpr } - - { id: 3, class: gpr } - -# CHECK: body: -# CHECK: FCMPSrr %0, %0, implicit-def %nzcv -# CHECK: [[TST_MI:%[0-9]+]] = CSINCWr %wzr, %wzr, 4, implicit %nzcv -# CHECK: [[TST_GT:%[0-9]+]] = CSINCWr %wzr, %wzr, 12, implicit %nzcv -# CHECK: %1 = ORRWrr [[TST_MI]], [[TST_GT]] - -# CHECK: FCMPDrr %2, %2, implicit-def %nzcv -# CHECK: %3 = CSINCWr %wzr, %wzr, 5, implicit %nzcv - -body: | - bb.0: - liveins: %w0, %x0 - - %0(s32) = COPY %s0 - %1(s1) = G_FCMP floatpred(one), %0, %0 - - %2(s64) = COPY %d0 - %3(s1) = G_FCMP floatpred(uge), %2, %2 - -... - ---- -# CHECK-LABEL: name: phi -name: phi -legalized: true -regBankSelected: true -tracksRegLiveness: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: fpr32 } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: bb.1: -# CHECK: %2 = PHI %0, %bb.0, %2, %bb.1 - -body: | - bb.0: - liveins: %s0, %w0 - successors: %bb.1 - %0(s32) = COPY %s0 - %1(s1) = COPY %w0 - - bb.1: - successors: %bb.1, %bb.2 - %2(s32) = PHI %0, %bb.0, %2, %bb.1 - G_BRCOND %1, %bb.1 - - bb.2: - %s0 = COPY %2 - RET_ReallyLR implicit %s0 -... - ---- -# CHECK-LABEL: name: select -name: select -legalized: true -regBankSelected: true -tracksRegLiveness: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32 } -# CHECK-NEXT: - { id: 1, class: gpr32 } -# CHECK-NEXT: - { id: 2, class: gpr32 } -# CHECK-NEXT: - { id: 3, class: gpr32 } -# CHECK-NEXT: - { id: 4, class: gpr64 } -# CHECK-NEXT: - { id: 5, class: gpr64 } -# CHECK-NEXT: - { id: 6, class: gpr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - - { id: 5, class: gpr } - - { id: 6, class: gpr } - -# CHECK: body: -# CHECK: %wzr = ANDSWri %0, 0, implicit-def %nzcv -# CHECK: %3 = CSELWr %1, %2, 1, implicit %nzcv -# CHECK: %wzr = ANDSWri %0, 0, implicit-def %nzcv -# CHECK: %6 = CSELXr %4, %5, 1, implicit %nzcv -body: | - bb.0: - liveins: %w0, %w1, %w2 - %0(s1) = COPY %w0 - - %1(s32) = COPY %w1 - %2(s32) = COPY %w2 - %3(s32) = G_SELECT %0, %1, %2 - - %4(s64) = COPY %x0 - %5(s64) = COPY %x1 - %6(s64) = G_SELECT %0, %4, %5 -... 
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll index 579ef777223c..006308641184 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-stackprotect.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-apple-ios %s -stop-after=irtranslator -o - -global-isel | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios %s -stop-after=irtranslator -o - -global-isel | FileCheck %s ; CHECK: name: test_stack_guard diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index 15b4012f383d..02848021dbc0 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -52,18 +52,40 @@ define void @allocai64() { ; CHECK: body: ; ; ABI/constant lowering and IR-level entry basic block. -; CHECK: {{bb.[0-9]+}} (%ir-block.{{[0-9]+}}): +; CHECK: {{bb.[0-9]+}}.entry: ; ; Make sure we have one successor and only one. -; CHECK-NEXT: successors: %[[END:bb.[0-9]+.end]](0x80000000) +; CHECK-NEXT: successors: %[[BB2:bb.[0-9]+.bb2]](0x80000000) ; ; Check that we emit the correct branch. -; CHECK: G_BR %[[END]] +; CHECK: G_BR %[[BB2]] ; ; Check that end contains the return instruction. -; CHECK: [[END]]: +; CHECK: [[END:bb.[0-9]+.end]]: ; CHECK-NEXT: RET_ReallyLR +; +; CHECK: {{bb.[0-9]+}}.bb2: +; CHECK-NEXT: successors: %[[END]](0x80000000) +; CHECK: G_BR %[[END]] define void @uncondbr() { +entry: + br label %bb2 +end: + ret void +bb2: + br label %end +} + +; CHECK-LABEL: name: uncondbr_fallthrough +; CHECK: body: +; CHECK: {{bb.[0-9]+}}.entry: +; CHECK-NEXT: successors: %[[END:bb.[0-9]+.end]](0x80000000) +; We don't emit a branch here, as we can fallthrough to the successor. 
+; CHECK-NOT: G_BR +; CHECK: [[END]]: +; CHECK-NEXT: RET_ReallyLR +define void @uncondbr_fallthrough() { +entry: br label %end end: ret void @@ -117,33 +139,35 @@ false: ; CHECK: G_BRCOND %[[regicmp100]](s1), %[[BB_CASE100]] ; CHECK: G_BR %[[BB_NOTCASE100_CHECKNEXT]] ; -; CHECK: [[BB_CASE100]]: -; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000) -; CHECK: %[[regretc100:[0-9]+]](s32) = G_ADD %0, %[[reg1]] -; CHECK: G_BR %[[BB_RET]] ; CHECK: [[BB_NOTCASE100_CHECKNEXT]]: ; CHECK-NEXT: successors: %[[BB_CASE200:bb.[0-9]+.case200]](0x40000000), %[[BB_NOTCASE200_CHECKNEXT:bb.[0-9]+.entry]](0x40000000) ; CHECK: %[[regicmp200:[0-9]+]](s1) = G_ICMP intpred(eq), %[[reg200]](s32), %0 ; CHECK: G_BRCOND %[[regicmp200]](s1), %[[BB_CASE200]] ; CHECK: G_BR %[[BB_NOTCASE200_CHECKNEXT]] ; -; CHECK: [[BB_CASE200]]: -; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000) -; CHECK: %[[regretc200:[0-9]+]](s32) = G_ADD %0, %[[reg2]] -; CHECK: G_BR %[[BB_RET]] ; CHECK: [[BB_NOTCASE200_CHECKNEXT]]: ; CHECK-NEXT: successors: %[[BB_DEFAULT:bb.[0-9]+.default]](0x80000000) ; CHECK: G_BR %[[BB_DEFAULT]] ; ; CHECK: [[BB_DEFAULT]]: -; CHECK-NEXT: successors: %[[BB_RET]](0x80000000) +; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000) ; CHECK: %[[regretdefault:[0-9]+]](s32) = G_ADD %0, %[[reg0]] ; CHECK: G_BR %[[BB_RET]] ; +; CHECK: [[BB_CASE100]]: +; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000) +; CHECK: %[[regretc100:[0-9]+]](s32) = G_ADD %0, %[[reg1]] +; CHECK: G_BR %[[BB_RET]] +; +; CHECK: [[BB_CASE200]]: +; CHECK-NEXT: successors: %[[BB_RET]](0x80000000) +; CHECK: %[[regretc200:[0-9]+]](s32) = G_ADD %0, %[[reg2]] +; ; CHECK: [[BB_RET]]: ; CHECK-NEXT: %[[regret:[0-9]+]](s32) = PHI %[[regretdefault]](s32), %[[BB_DEFAULT]], %[[regretc100]](s32), %[[BB_CASE100]] ; CHECK: %w0 = COPY %[[regret]](s32) ; CHECK: RET_ReallyLR implicit %w0 +; define i32 @switch(i32 %argc) { entry: switch i32 %argc, label %default [ @@ -168,6 +192,95 @@ return: ret i32 %res } + ; The switch lowering code changes the CFG, which means that the original + ; %entry block is no longer a predecessor for the phi instruction. We need to + ; use the correct lowered MachineBasicBlock instead. 
+; CHECK-LABEL: name: test_cfg_remap +; CHECK: {{bb.[0-9]+.entry}}: +; CHECK-NEXT: successors: %{{bb.[0-9]+.next}}(0x40000000), %[[NOTCASE1_BLOCK:bb.[0-9]+.entry]](0x40000000) +; CHECK: [[NOTCASE1_BLOCK]]: +; CHECK-NEXT: successors: %{{bb.[0-9]+.other}}(0x40000000), %[[NOTCASE57_BLOCK:bb.[0-9]+.entry]](0x40000000) +; CHECK: [[NOTCASE57_BLOCK]]: +; CHECK-NEXT: successors: %[[PHI_BLOCK:bb.[0-9]+.phi.block]](0x80000000) +; CHECK: G_BR %[[PHI_BLOCK]] +; +; CHECK: [[PHI_BLOCK]]: +; CHECK-NEXT: PHI %{{.*}}(s32), %[[NOTCASE57_BLOCK:bb.[0-9]+.entry]], %{{.*}}(s32), +; +define i32 @test_cfg_remap(i32 %in) { +entry: + switch i32 %in, label %phi.block [i32 1, label %next + i32 57, label %other] + +next: + br label %phi.block + +other: + ret i32 undef + +phi.block: + %res = phi i32 [1, %entry], [42, %next] + ret i32 %res +} + +; CHECK-LABEL: name: test_cfg_remap_multiple_preds +; CHECK: PHI [[ENTRY:%.*]](s32), %bb.{{[0-9]+}}.entry, [[ENTRY]](s32), %bb.{{[0-9]+}}.entry +define i32 @test_cfg_remap_multiple_preds(i32 %in) { +entry: + switch i32 %in, label %odd [i32 1, label %next + i32 57, label %other + i32 128, label %phi.block + i32 256, label %phi.block] +odd: + unreachable + +next: + br label %phi.block + +other: + ret i32 undef + +phi.block: + %res = phi i32 [1, %entry], [1, %entry], [42, %next] + ret i32 12 +} + +; Tests for indirect br. +; CHECK-LABEL: name: indirectbr +; CHECK: body: +; +; ABI/constant lowering and IR-level entry basic block. +; CHECK: {{bb.[0-9]+.entry}}: +; Make sure we have one successor +; CHECK-NEXT: successors: %[[BB_L1:bb.[0-9]+.L1]](0x80000000) +; CHECK-NOT: G_BR +; +; Check basic block L1 has 2 successors: BBL1 and BBL2 +; CHECK: [[BB_L1]] (address-taken): +; CHECK-NEXT: successors: %[[BB_L1]](0x40000000), +; CHECK: %[[BB_L2:bb.[0-9]+.L2]](0x40000000) +; CHECK: G_BRINDIRECT %{{[0-9]+}}(p0) +; +; Check basic block L2 is the return basic block +; CHECK: [[BB_L2]] (address-taken): +; CHECK-NEXT: RET_ReallyLR + +@indirectbr.L = internal unnamed_addr constant [3 x i8*] [i8* blockaddress(@indirectbr, %L1), i8* blockaddress(@indirectbr, %L2), i8* null], align 8 + +define void @indirectbr() { +entry: + br label %L1 +L1: ; preds = %entry, %L1 + %i = phi i32 [ 0, %entry ], [ %inc, %L1 ] + %inc = add i32 %i, 1 + %idxprom = zext i32 %i to i64 + %arrayidx = getelementptr inbounds [3 x i8*], [3 x i8*]* @indirectbr.L, i64 0, i64 %idxprom + %brtarget = load i8*, i8** %arrayidx, align 8 + indirectbr i8* %brtarget, [label %L1, label %L2] +L2: ; preds = %L1 + ret void +} + ; Tests for or. 
; CHECK-LABEL: name: ori64 ; CHECK: [[ARG1:%[0-9]+]](s64) = COPY %x0 @@ -293,11 +406,11 @@ define i64* @trivial_bitcast(i8* %a) { ; CHECK: [[A:%[0-9]+]](p0) = COPY %x0 ; CHECK: G_BR %[[CAST:bb\.[0-9]+.cast]] +; CHECK: [[END:bb\.[0-9]+.end]]: + ; CHECK: [[CAST]]: ; CHECK: {{%[0-9]+}}(p0) = COPY [[A]] -; CHECK: G_BR %[[END:bb\.[0-9]+.end]] - -; CHECK: [[END]]: +; CHECK: G_BR %[[END]] define i64* @trivial_bitcast_with_copy(i8* %a) { br label %cast @@ -375,7 +488,8 @@ define void @store(i64* %addr, i64 addrspace(42)* %addr42, i64 %val1, i64 %val2) ; CHECK-LABEL: name: intrinsics ; CHECK: [[CUR:%[0-9]+]](s32) = COPY %w0 ; CHECK: [[BITS:%[0-9]+]](s32) = COPY %w1 -; CHECK: [[PTR:%[0-9]+]](p0) = G_INTRINSIC intrinsic(@llvm.returnaddress), 0 +; CHECK: [[CREG:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[PTR:%[0-9]+]](p0) = G_INTRINSIC intrinsic(@llvm.returnaddress), [[CREG]] ; CHECK: [[PTR_VEC:%[0-9]+]](p0) = G_FRAME_INDEX %stack.0.ptr.vec ; CHECK: [[VEC:%[0-9]+]](<8 x s8>) = G_LOAD [[PTR_VEC]] ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.st2), [[VEC]](<8 x s8>), [[VEC]](<8 x s8>), [[PTR]](p0) @@ -433,8 +547,8 @@ define void @unreachable(i32 %a) { ; CHECK-LABEL: name: constant_int ; CHECK: [[IN:%[0-9]+]](s32) = COPY %w0 ; CHECK: [[ONE:%[0-9]+]](s32) = G_CONSTANT i32 1 -; CHECK: G_BR +; CHECK: {{bb.[0-9]+}}.next: ; CHECK: [[SUM1:%[0-9]+]](s32) = G_ADD [[IN]], [[ONE]] ; CHECK: [[SUM2:%[0-9]+]](s32) = G_ADD [[IN]], [[ONE]] ; CHECK: [[RES:%[0-9]+]](s32) = G_ADD [[SUM1]], [[SUM2]] @@ -796,7 +910,7 @@ define void @test_extractvalue_agg(%struct.nested* %addr, {i8, i32}* %addr2) { ; CHECK-LABEL: name: test_insertvalue ; CHECK: [[VAL:%[0-9]+]](s32) = COPY %w1 ; CHECK: [[STRUCT:%[0-9]+]](s128) = G_LOAD -; CHECK: [[NEWSTRUCT:%[0-9]+]](s128) = G_INSERT [[STRUCT]](s128), [[VAL]](s32), 64 +; CHECK: [[NEWSTRUCT:%[0-9]+]](s128) = G_INSERT [[STRUCT]], [[VAL]](s32), 64 ; CHECK: G_STORE [[NEWSTRUCT]](s128), define void @test_insertvalue(%struct.nested* %addr, i32 %val) { %struct = load %struct.nested, %struct.nested* %addr @@ -805,10 +919,30 @@ define void @test_insertvalue(%struct.nested* %addr, i32 %val) { ret void } +define [1 x i64] @test_trivial_insert([1 x i64] %s, i64 %val) { +; CHECK-LABEL: name: test_trivial_insert +; CHECK: [[STRUCT:%[0-9]+]](s64) = COPY %x0 +; CHECK: [[VAL:%[0-9]+]](s64) = COPY %x1 +; CHECK: [[RES:%[0-9]+]](s64) = COPY [[VAL]](s64) +; CHECK: %x0 = COPY [[RES]] + %res = insertvalue [1 x i64] %s, i64 %val, 0 + ret [1 x i64] %res +} + +define [1 x i8*] @test_trivial_insert_ptr([1 x i8*] %s, i8* %val) { +; CHECK-LABEL: name: test_trivial_insert_ptr +; CHECK: [[STRUCT:%[0-9]+]](s64) = COPY %x0 +; CHECK: [[VAL:%[0-9]+]](p0) = COPY %x1 +; CHECK: [[RES:%[0-9]+]](s64) = G_PTRTOINT [[VAL]](p0) +; CHECK: %x0 = COPY [[RES]] + %res = insertvalue [1 x i8*] %s, i8* %val, 0 + ret [1 x i8*] %res +} + ; CHECK-LABEL: name: test_insertvalue_agg ; CHECK: [[SMALLSTRUCT:%[0-9]+]](s64) = G_LOAD ; CHECK: [[STRUCT:%[0-9]+]](s128) = G_LOAD -; CHECK: [[RES:%[0-9]+]](s128) = G_INSERT [[STRUCT]](s128), [[SMALLSTRUCT]](s64), 32 +; CHECK: [[RES:%[0-9]+]](s128) = G_INSERT [[STRUCT]], [[SMALLSTRUCT]](s64), 32 ; CHECK: G_STORE [[RES]](s128) define void @test_insertvalue_agg(%struct.nested* %addr, {i8, i32}* %addr2) { %smallstruct = load {i8, i32}, {i8, i32}* %addr2 @@ -840,6 +974,30 @@ define i8* @test_select_ptr(i1 %tst, i8* %lhs, i8* %rhs) { ret i8* %res } +; CHECK-LABEL: name: test_select_vec +; CHECK: [[TST:%[0-9]+]](s1) = COPY %w0 +; CHECK: [[LHS:%[0-9]+]](<4 x s32>) = COPY %q0 +; CHECK: 
[[RHS:%[0-9]+]](<4 x s32>) = COPY %q1 +; CHECK: [[RES:%[0-9]+]](<4 x s32>) = G_SELECT [[TST]](s1), [[LHS]], [[RHS]] +; CHECK: %q0 = COPY [[RES]] +define <4 x i32> @test_select_vec(i1 %tst, <4 x i32> %lhs, <4 x i32> %rhs) { + %res = select i1 %tst, <4 x i32> %lhs, <4 x i32> %rhs + ret <4 x i32> %res +} + +; CHECK-LABEL: name: test_vselect_vec +; CHECK: [[TST32:%[0-9]+]](<4 x s32>) = COPY %q0 +; CHECK: [[LHS:%[0-9]+]](<4 x s32>) = COPY %q1 +; CHECK: [[RHS:%[0-9]+]](<4 x s32>) = COPY %q2 +; CHECK: [[TST:%[0-9]+]](<4 x s1>) = G_TRUNC [[TST32]](<4 x s32>) +; CHECK: [[RES:%[0-9]+]](<4 x s32>) = G_SELECT [[TST]](<4 x s1>), [[LHS]], [[RHS]] +; CHECK: %q0 = COPY [[RES]] +define <4 x i32> @test_vselect_vec(<4 x i32> %tst32, <4 x i32> %lhs, <4 x i32> %rhs) { + %tst = trunc <4 x i32> %tst32 to <4 x i1> + %res = select <4 x i1> %tst, <4 x i32> %lhs, <4 x i32> %rhs + ret <4 x i32> %res +} + ; CHECK-LABEL: name: test_fptosi ; CHECK: [[FPADDR:%[0-9]+]](p0) = COPY %x0 ; CHECK: [[FP:%[0-9]+]](s32) = G_LOAD [[FPADDR]](p0) @@ -927,6 +1085,19 @@ define void @float_comparison(float* %a.addr, float* %b.addr, i1* %bool.addr) { ret void } +; CHECK-LABEL: name: trivial_float_comparison +; CHECK: [[ENTRY_R1:%[0-9]+]](s1) = G_CONSTANT i1 false +; CHECK: [[ENTRY_R2:%[0-9]+]](s1) = G_CONSTANT i1 true +; CHECK: [[R1:%[0-9]+]](s1) = COPY [[ENTRY_R1]](s1) +; CHECK: [[R2:%[0-9]+]](s1) = COPY [[ENTRY_R2]](s1) +; CHECK: G_ADD [[R1]], [[R2]] +define i1 @trivial_float_comparison(double %a, double %b) { + %r1 = fcmp false double %a, %b + %r2 = fcmp true double %a, %b + %sum = add i1 %r1, %r2 + ret i1 %sum +} + @var = global i32 0 define i32* @test_global() { @@ -969,6 +1140,34 @@ define void @test_memcpy(i8* %dst, i8* %src, i64 %size) { ret void } +declare void @llvm.memmove.p0i8.p0i8.i64(i8*, i8*, i64, i32 %align, i1 %volatile) +define void @test_memmove(i8* %dst, i8* %src, i64 %size) { +; CHECK-LABEL: name: test_memmove +; CHECK: [[DST:%[0-9]+]](p0) = COPY %x0 +; CHECK: [[SRC:%[0-9]+]](p0) = COPY %x1 +; CHECK: [[SIZE:%[0-9]+]](s64) = COPY %x2 +; CHECK: %x0 = COPY [[DST]] +; CHECK: %x1 = COPY [[SRC]] +; CHECK: %x2 = COPY [[SIZE]] +; CHECK: BL $memmove, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %x0, implicit %x1, implicit %x2 + call void @llvm.memmove.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i32 1, i1 0) + ret void +} + +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32 %align, i1 %volatile) +define void @test_memset(i8* %dst, i8 %val, i64 %size) { +; CHECK-LABEL: name: test_memset +; CHECK: [[DST:%[0-9]+]](p0) = COPY %x0 +; CHECK: [[SRC:%[0-9]+]](s8) = COPY %w1 +; CHECK: [[SIZE:%[0-9]+]](s64) = COPY %x2 +; CHECK: %x0 = COPY [[DST]] +; CHECK: %w1 = COPY [[SRC]] +; CHECK: %x2 = COPY [[SIZE]] +; CHECK: BL $memset, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %x0, implicit %w1, implicit %x2 + call void @llvm.memset.p0i8.i64(i8* %dst, i8 %val, i64 %size, i32 1, i1 0) + ret void +} + declare i64 @llvm.objectsize.i64(i8*, i1) declare i32 @llvm.objectsize.i32(i8*, i1) define void @test_objectsize(i8* %addr0, i8* %addr1) { @@ -1004,9 +1203,341 @@ define i8* @test_const_placement() { ; CHECK: bb.{{[0-9]+}} (%ir-block.{{[0-9]+}}): ; CHECK: [[VAL_INT:%[0-9]+]](s32) = G_CONSTANT i32 42 ; CHECK: [[VAL:%[0-9]+]](p0) = G_INTTOPTR [[VAL_INT]](s32) -; CHECK: G_BR +; CHECK: {{bb.[0-9]+}}.next: br label %next next: ret i8* inttoptr(i32 42 to i8*) } + +declare void @llvm.va_end(i8*) +define void @test_va_end(i8* %list) { +; CHECK-LABEL: name: test_va_end +; CHECK-NOT: va_end +; CHECK-NOT: INTRINSIC +; CHECK: 
RET_ReallyLR + call void @llvm.va_end(i8* %list) + ret void +} + +define void @test_va_arg(i8* %list) { +; CHECK-LABEL: test_va_arg +; CHECK: [[LIST:%[0-9]+]](p0) = COPY %x0 +; CHECK: G_VAARG [[LIST]](p0), 8 +; CHECK: G_VAARG [[LIST]](p0), 1 +; CHECK: G_VAARG [[LIST]](p0), 16 + + %v0 = va_arg i8* %list, i64 + %v1 = va_arg i8* %list, i8 + %v2 = va_arg i8* %list, i128 + ret void +} + +declare float @llvm.pow.f32(float, float) +define float @test_pow_intrin(float %l, float %r) { +; CHECK-LABEL: name: test_pow_intrin +; CHECK: [[LHS:%[0-9]+]](s32) = COPY %s0 +; CHECK: [[RHS:%[0-9]+]](s32) = COPY %s1 +; CHECK: [[RES:%[0-9]+]](s32) = G_FPOW [[LHS]], [[RHS]] +; CHECK: %s0 = COPY [[RES]] + %res = call float @llvm.pow.f32(float %l, float %r) + ret float %res +} + +declare void @llvm.lifetime.start.p0i8(i64, i8*) +declare void @llvm.lifetime.end.p0i8(i64, i8*) +define void @test_lifetime_intrin() { +; CHECK-LABEL: name: test_lifetime_intrin +; CHECK: RET_ReallyLR + %slot = alloca i8, i32 4 + call void @llvm.lifetime.start.p0i8(i64 0, i8* %slot) + call void @llvm.lifetime.end.p0i8(i64 0, i8* %slot) + ret void +} + +define void @test_load_store_atomics(i8* %addr) { +; CHECK-LABEL: name: test_load_store_atomics +; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x0 +; CHECK: [[V0:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load unordered 1 from %ir.addr) +; CHECK: G_STORE [[V0]](s8), [[ADDR]](p0) :: (store monotonic 1 into %ir.addr) +; CHECK: [[V1:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load acquire 1 from %ir.addr) +; CHECK: G_STORE [[V1]](s8), [[ADDR]](p0) :: (store release 1 into %ir.addr) +; CHECK: [[V2:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load singlethread seq_cst 1 from %ir.addr) +; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store singlethread monotonic 1 into %ir.addr) + %v0 = load atomic i8, i8* %addr unordered, align 1 + store atomic i8 %v0, i8* %addr monotonic, align 1 + + %v1 = load atomic i8, i8* %addr acquire, align 1 + store atomic i8 %v1, i8* %addr release, align 1 + + %v2 = load atomic i8, i8* %addr singlethread seq_cst, align 1 + store atomic i8 %v2, i8* %addr singlethread monotonic, align 1 + + ret void +} + +define float @test_fneg_f32(float %x) { +; CHECK-LABEL: name: test_fneg_f32 +; CHECK: [[ARG:%[0-9]+]](s32) = COPY %s0 +; CHECK: [[RES:%[0-9]+]](s32) = G_FNEG [[ARG]] +; CHECK: %s0 = COPY [[RES]](s32) + %neg = fsub float -0.000000e+00, %x + ret float %neg +} + +define double @test_fneg_f64(double %x) { +; CHECK-LABEL: name: test_fneg_f64 +; CHECK: [[ARG:%[0-9]+]](s64) = COPY %d0 +; CHECK: [[RES:%[0-9]+]](s64) = G_FNEG [[ARG]] +; CHECK: %d0 = COPY [[RES]](s64) + %neg = fsub double -0.000000e+00, %x + ret double %neg +} + +define void @test_trivial_inlineasm() { +; CHECK-LABEL: name: test_trivial_inlineasm +; CHECK: INLINEASM $wibble, 1 +; CHECK: INLINEASM $wibble, 0 + call void asm sideeffect "wibble", ""() + call void asm "wibble", ""() + ret void +} + +define <2 x i32> @test_insertelement(<2 x i32> %vec, i32 %elt, i32 %idx){ +; CHECK-LABEL: name: test_insertelement +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = COPY %d0 +; CHECK: [[ELT:%[0-9]+]](s32) = COPY %w0 +; CHECK: [[IDX:%[0-9]+]](s32) = COPY %w1 +; CHECK: [[RES:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[VEC]], [[ELT]](s32), [[IDX]](s32) +; CHECK: %d0 = COPY [[RES]](<2 x s32>) + %res = insertelement <2 x i32> %vec, i32 %elt, i32 %idx + ret <2 x i32> %res +} + +define i32 @test_extractelement(<2 x i32> %vec, i32 %idx) { +; CHECK-LABEL: name: test_extractelement +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = COPY %d0 +; CHECK: [[IDX:%[0-9]+]](s32) = 
COPY %w0 +; CHECK: [[RES:%[0-9]+]](s32) = G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[IDX]](s32) +; CHECK: %w0 = COPY [[RES]](s32) + %res = extractelement <2 x i32> %vec, i32 %idx + ret i32 %res +} + +define i32 @test_singleelementvector(i32 %elt){ +; CHECK-LABEL: name: test_singleelementvector +; CHECK: [[ELT:%[0-9]+]](s32) = COPY %w0 +; CHECK-NOT: G_INSERT_VECTOR_ELT +; CHECK-NOT: G_EXTRACT_VECTOR_ELT +; CHECK: %w0 = COPY [[ELT]](s32) + %vec = insertelement <1 x i32> undef, i32 %elt, i32 0 + %res = extractelement <1 x i32> %vec, i32 0 + ret i32 %res +} + +define <2 x i32> @test_constantaggzerovector_v2i32() { +; CHECK-LABEL: name: test_constantaggzerovector_v2i32 +; CHECK: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32) +; CHECK: %d0 = COPY [[VEC]](<2 x s32>) + ret <2 x i32> zeroinitializer +} + +define <2 x float> @test_constantaggzerovector_v2f32() { +; CHECK-LABEL: name: test_constantaggzerovector_v2f32 +; CHECK: [[ZERO:%[0-9]+]](s32) = G_FCONSTANT float 0.000000e+00 +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32) +; CHECK: %d0 = COPY [[VEC]](<2 x s32>) + ret <2 x float> zeroinitializer +} + +define i32 @test_constantaggzerovector_v3i32() { +; CHECK-LABEL: name: test_constantaggzerovector_v3i32 +; CHECK: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[VEC:%[0-9]+]](<3 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32), [[ZERO]](s32) +; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>) + %elt = extractelement <3 x i32> zeroinitializer, i32 1 + ret i32 %elt +} + +define <2 x i32> @test_constantdatavector_v2i32() { +; CHECK-LABEL: name: test_constantdatavector_v2i32 +; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2 +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32) +; CHECK: %d0 = COPY [[VEC]](<2 x s32>) + ret <2 x i32> +} + +define i32 @test_constantdatavector_v3i32() { +; CHECK-LABEL: name: test_constantdatavector_v3i32 +; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2 +; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3 +; CHECK: [[VEC:%[0-9]+]](<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32), [[C3]](s32) +; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>) + %elt = extractelement <3 x i32> , i32 1 + ret i32 %elt +} + +define <4 x i32> @test_constantdatavector_v4i32() { +; CHECK-LABEL: name: test_constantdatavector_v4i32 +; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2 +; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3 +; CHECK: [[C4:%[0-9]+]](s32) = G_CONSTANT i32 4 +; CHECK: [[VEC:%[0-9]+]](<4 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32) +; CHECK: %q0 = COPY [[VEC]](<4 x s32>) + ret <4 x i32> +} + +define <2 x double> @test_constantdatavector_v2f64() { +; CHECK-LABEL: name: test_constantdatavector_v2f64 +; CHECK: [[FC1:%[0-9]+]](s64) = G_FCONSTANT double 1.000000e+00 +; CHECK: [[FC2:%[0-9]+]](s64) = G_FCONSTANT double 2.000000e+00 +; CHECK: [[VEC:%[0-9]+]](<2 x s64>) = G_MERGE_VALUES [[FC1]](s64), [[FC2]](s64) +; CHECK: %q0 = COPY [[VEC]](<2 x s64>) + ret <2 x double> +} + +define i32 @test_constantaggzerovector_v1s32(i32 %arg){ +; CHECK-LABEL: name: test_constantaggzerovector_v1s32 +; CHECK: [[ARG:%[0-9]+]](s32) = COPY %w0 +; CHECK: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK-NOT: G_MERGE_VALUES +; CHECK: G_ADD [[ARG]], [[C0]] + %vec = insertelement <1 x i32> undef, 
i32 %arg, i32 0 + %add = add <1 x i32> %vec, zeroinitializer + %res = extractelement <1 x i32> %add, i32 0 + ret i32 %res +} + +define i32 @test_constantdatavector_v1s32(i32 %arg){ +; CHECK-LABEL: name: test_constantdatavector_v1s32 +; CHECK: [[ARG:%[0-9]+]](s32) = COPY %w0 +; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK-NOT: G_MERGE_VALUES +; CHECK: G_ADD [[ARG]], [[C1]] + %vec = insertelement <1 x i32> undef, i32 %arg, i32 0 + %add = add <1 x i32> %vec, + %res = extractelement <1 x i32> %add, i32 0 + ret i32 %res +} + +declare ghccc float @different_call_conv_target(float %x) +define float @test_different_call_conv_target(float %x) { +; CHECK-LABEL: name: test_different_call_conv +; CHECK: [[X:%[0-9]+]](s32) = COPY %s0 +; CHECK: %s8 = COPY [[X]] +; CHECK: BL @different_call_conv_target, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %s8, implicit-def %s0 + %res = call ghccc float @different_call_conv_target(float %x) + ret float %res +} + +define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) { +; CHECK-LABEL: name: test_shufflevector_s32_v2s32 +; CHECK: [[ARG:%[0-9]+]](s32) = COPY %w0 +; CHECK-DAG: [[UNDEF:%[0-9]+]](s32) = IMPLICIT_DEF +; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32) +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], [[MASK]](<2 x s32>) +; CHECK: %d0 = COPY [[VEC]](<2 x s32>) + %vec = insertelement <1 x i32> undef, i32 %arg, i32 0 + %res = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %res +} + +define i32 @test_shufflevector_v2s32_s32(<2 x i32> %arg) { +; CHECK-LABEL: name: test_shufflevector_v2s32_s32 +; CHECK: [[ARG:%[0-9]+]](<2 x s32>) = COPY %d0 +; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF +; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK: [[RES:%[0-9]+]](s32) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[C1]](s32) +; CHECK: %w0 = COPY [[RES]](s32) + %vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <1 x i32> + %res = extractelement <1 x i32> %vec, i32 0 + ret i32 %res +} + +define <2 x i32> @test_shufflevector_v2s32_v2s32(<2 x i32> %arg) { +; CHECK-LABEL: name: test_shufflevector_v2s32_v2s32 +; CHECK: [[ARG:%[0-9]+]](<2 x s32>) = COPY %d0 +; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF +; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32) +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[MASK]](<2 x s32>) +; CHECK: %d0 = COPY [[VEC]](<2 x s32>) + %res = shufflevector <2 x i32> %arg, <2 x i32> undef, <2 x i32> + ret <2 x i32> %res +} + +define i32 @test_shufflevector_v2s32_v3s32(<2 x i32> %arg) { +; CHECK-LABEL: name: test_shufflevector_v2s32_v3s32 +; CHECK: [[ARG:%[0-9]+]](<2 x s32>) = COPY %d0 +; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF +; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK-DAG: [[MASK:%[0-9]+]](<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32), [[C1]](s32) +; CHECK: [[VEC:%[0-9]+]](<3 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[MASK]](<3 x s32>) +; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>) + %vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <3 x i32> + %res = extractelement <3 x i32> %vec, i32 0 + ret i32 %res +} + +define <4 x i32> 
@test_shufflevector_v2s32_v4s32(<2 x i32> %arg1, <2 x i32> %arg2) { +; CHECK-LABEL: name: test_shufflevector_v2s32_v4s32 +; CHECK: [[ARG1:%[0-9]+]](<2 x s32>) = COPY %d0 +; CHECK: [[ARG2:%[0-9]+]](<2 x s32>) = COPY %d1 +; CHECK: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2 +; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3 +; CHECK: [[MASK:%[0-9]+]](<4 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32) +; CHECK: [[VEC:%[0-9]+]](<4 x s32>) = G_SHUFFLE_VECTOR [[ARG1]](<2 x s32>), [[ARG2]], [[MASK]](<4 x s32>) +; CHECK: %q0 = COPY [[VEC]](<4 x s32>) + %res = shufflevector <2 x i32> %arg1, <2 x i32> %arg2, <4 x i32> + ret <4 x i32> %res +} + +define <2 x i32> @test_shufflevector_v4s32_v2s32(<4 x i32> %arg) { +; CHECK-LABEL: name: test_shufflevector_v4s32_v2s32 +; CHECK: [[ARG:%[0-9]+]](<4 x s32>) = COPY %q0 +; CHECK-DAG: [[UNDEF:%[0-9]+]](<4 x s32>) = IMPLICIT_DEF +; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK-DAG: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3 +; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C3]](s32) +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<4 x s32>), [[UNDEF]], [[MASK]](<2 x s32>) +; CHECK: %d0 = COPY [[VEC]](<2 x s32>) + %res = shufflevector <4 x i32> %arg, <4 x i32> undef, <2 x i32> + ret <2 x i32> %res +} + + +define <16 x i8> @test_shufflevector_v8s8_v16s8(<8 x i8> %arg1, <8 x i8> %arg2) { +; CHECK-LABEL: name: test_shufflevector_v8s8_v16s8 +; CHECK: [[ARG1:%[0-9]+]](<8 x s8>) = COPY %d0 +; CHECK: [[ARG2:%[0-9]+]](<8 x s8>) = COPY %d1 +; CHECK: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[C8:%[0-9]+]](s32) = G_CONSTANT i32 8 +; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK: [[C9:%[0-9]+]](s32) = G_CONSTANT i32 9 +; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2 +; CHECK: [[C10:%[0-9]+]](s32) = G_CONSTANT i32 10 +; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3 +; CHECK: [[C11:%[0-9]+]](s32) = G_CONSTANT i32 11 +; CHECK: [[C4:%[0-9]+]](s32) = G_CONSTANT i32 4 +; CHECK: [[C12:%[0-9]+]](s32) = G_CONSTANT i32 12 +; CHECK: [[C5:%[0-9]+]](s32) = G_CONSTANT i32 5 +; CHECK: [[C13:%[0-9]+]](s32) = G_CONSTANT i32 13 +; CHECK: [[C6:%[0-9]+]](s32) = G_CONSTANT i32 6 +; CHECK: [[C14:%[0-9]+]](s32) = G_CONSTANT i32 14 +; CHECK: [[C7:%[0-9]+]](s32) = G_CONSTANT i32 7 +; CHECK: [[C15:%[0-9]+]](s32) = G_CONSTANT i32 15 +; CHECK: [[MASK:%[0-9]+]](<16 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C8]](s32), [[C1]](s32), [[C9]](s32), [[C2]](s32), [[C10]](s32), [[C3]](s32), [[C11]](s32), [[C4]](s32), [[C12]](s32), [[C5]](s32), [[C13]](s32), [[C6]](s32), [[C14]](s32), [[C7]](s32), [[C15]](s32) +; CHECK: [[VEC:%[0-9]+]](<16 x s8>) = G_SHUFFLE_VECTOR [[ARG1]](<8 x s8>), [[ARG2]], [[MASK]](<16 x s32>) +; CHECK: %q0 = COPY [[VEC]](<16 x s8>) + %res = shufflevector <8 x i8> %arg1, <8 x i8> %arg2, <16 x i32> + ret <16 x i8> %res +} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir index 4c67c0daaf74..739fdd5cb4c5 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir +++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir @@ -1,5 +1,5 @@ -# RUN: llc -O0 -run-pass=regbankselect -global-isel %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=FAST -# RUN: llc -O0 -run-pass=regbankselect -global-isel %s -regbankselect-greedy -o - 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY +# RUN: llc -O0 
-run-pass=regbankselect -global-isel %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=FAST +# RUN: llc -O0 -run-pass=regbankselect -global-isel %s -regbankselect-greedy -o - -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY --- | ; ModuleID = 'generic-virtual-registers-type-error.mir' @@ -315,8 +315,8 @@ body: | ; Fast mode tries to reuse the source of the copy for the destination. ; Now, the default mapping says that %0 and %1 need to be in FPR. ; The repairing code insert two copies to materialize that. - ; FAST-NEXT: %3(s64) = COPY %0 - ; FAST-NEXT: %4(s64) = COPY %1 + ; FAST-NEXT: %3(<2 x s32>) = COPY %0 + ; FAST-NEXT: %4(<2 x s32>) = COPY %1 ; The mapping of G_OR is on FPR. ; FAST-NEXT: %2(<2 x s32>) = G_OR %3, %4 @@ -362,13 +362,13 @@ body: | ; Fast mode tries to reuse the source of the copy for the destination. ; Now, the default mapping says that %0 and %1 need to be in FPR. ; The repairing code insert two copies to materialize that. - ; FAST-NEXT: %3(s64) = COPY %0 - ; FAST-NEXT: %4(s64) = COPY %1 + ; FAST-NEXT: %3(<2 x s32>) = COPY %0 + ; FAST-NEXT: %4(<2 x s32>) = COPY %1 ; The mapping of G_OR is on FPR. ; FAST-NEXT: %2(<2 x s32>) = G_OR %3, %4 ; Greedy mode remapped the instruction on the GPR bank. - ; GREEDY-NEXT: %3(s64) = G_OR %0, %1 + ; GREEDY-NEXT: %3(<2 x s32>) = G_OR %0, %1 ; We need to keep %2 into FPR because we do not know anything about it. ; GREEDY-NEXT: %2(<2 x s32>) = COPY %3 %0(<2 x s32>) = COPY %x0 diff --git a/test/CodeGen/AArch64/GlobalISel/call-translator.ll b/test/CodeGen/AArch64/GlobalISel/call-translator.ll index 7bedad38de1a..f8d95c88cc8f 100644 --- a/test/CodeGen/AArch64/GlobalISel/call-translator.ll +++ b/test/CodeGen/AArch64/GlobalISel/call-translator.ll @@ -1,7 +1,9 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s ; CHECK-LABEL: name: test_trivial_call +; CHECK: ADJCALLSTACKDOWN 0, implicit-def %sp, implicit %sp ; CHECK: BL @trivial_callee, csr_aarch64_aapcs, implicit-def %lr +; CHECK: ADJCALLSTACKUP 0, 0, implicit-def %sp, implicit %sp declare void @trivial_callee() define void @test_trivial_call() { call void @trivial_callee() @@ -61,7 +63,13 @@ define void @test_multiple_args(i64 %in) { ; CHECK: [[I64:%[0-9]+]](s64) = COPY %x0 ; CHECK: [[I8:%[0-9]+]](s8) = COPY %w1 ; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x2 -; CHECK: [[ARG:%[0-9]+]](s192) = G_SEQUENCE [[DBL]](s64), 0, [[I64]](s64), 64, [[I8]](s8), 128 + +; CHECK: [[UNDEF:%[0-9]+]](s192) = IMPLICIT_DEF +; CHECK: [[ARG0:%[0-9]+]](s192) = G_INSERT [[UNDEF]], [[DBL]](s64), 0 +; CHECK: [[ARG1:%[0-9]+]](s192) = G_INSERT [[ARG0]], [[I64]](s64), 64 +; CHECK: [[ARG2:%[0-9]+]](s192) = G_INSERT [[ARG1]], [[I8]](s8), 128 +; CHECK: [[ARG:%[0-9]+]](s192) = COPY [[ARG2]] + ; CHECK: G_STORE [[ARG]](s192), [[ADDR]](p0) ; CHECK: RET_ReallyLR define void @test_struct_formal({double, i64, i8} %in, {double, i64, i8}* %addr) { @@ -73,7 +81,11 @@ define void @test_struct_formal({double, i64, i8} %in, {double, i64, i8}* %addr) ; CHECK-LABEL: name: test_struct_return ; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x0 ; CHECK: [[VAL:%[0-9]+]](s192) = G_LOAD [[ADDR]](p0) -; CHECK: [[DBL:%[0-9]+]](s64), [[I64:%[0-9]+]](s64), [[I32:%[0-9]+]](s32) = G_EXTRACT [[VAL]](s192), 0, 64, 128 + +; CHECK: [[DBL:%[0-9]+]](s64) = G_EXTRACT [[VAL]](s192), 0 +; CHECK: [[I64:%[0-9]+]](s64) = G_EXTRACT [[VAL]](s192), 64 +; CHECK: [[I32:%[0-9]+]](s32) = G_EXTRACT [[VAL]](s192), 128 + ; CHECK: %d0 = COPY 
[[DBL]](s64) ; CHECK: %x0 = COPY [[I64]](s64) ; CHECK: %w1 = COPY [[I32]](s32) @@ -84,8 +96,14 @@ define {double, i64, i32} @test_struct_return({double, i64, i32}* %addr) { } ; CHECK-LABEL: name: test_arr_call +; CHECK: hasCalls: true ; CHECK: [[ARG:%[0-9]+]](s256) = G_LOAD -; CHECK: [[E0:%[0-9]+]](s64), [[E1:%[0-9]+]](s64), [[E2:%[0-9]+]](s64), [[E3:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 0, 64, 128, 192 + +; CHECK: [[E0:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 0 +; CHECK: [[E1:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 64 +; CHECK: [[E2:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 128 +; CHECK: [[E3:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 192 + ; CHECK: %x0 = COPY [[E0]](s64) ; CHECK: %x1 = COPY [[E1]](s64) ; CHECK: %x2 = COPY [[E2]](s64) @@ -168,6 +186,7 @@ define void @test_stack_slots([8 x i64], i64 %lhs, i64 %rhs, i64* %addr) { ; CHECK: [[C42:%[0-9]+]](s64) = G_CONSTANT i64 42 ; CHECK: [[C12:%[0-9]+]](s64) = G_CONSTANT i64 12 ; CHECK: [[PTR:%[0-9]+]](p0) = G_CONSTANT i64 0 +; CHECK: ADJCALLSTACKDOWN 24, implicit-def %sp, implicit %sp ; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp ; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 0 ; CHECK: [[C42_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C42_OFFS]](s64) @@ -181,6 +200,7 @@ define void @test_stack_slots([8 x i64], i64 %lhs, i64 %rhs, i64* %addr) { ; CHECK: [[PTR_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[PTR_OFFS]](s64) ; CHECK: G_STORE [[PTR]](p0), [[PTR_LOC]](p0) :: (store 8 into stack + 16, align 0) ; CHECK: BL @test_stack_slots +; CHECK: ADJCALLSTACKUP 24, 0, implicit-def %sp, implicit %sp define void @test_call_stack() { call void @test_stack_slots([8 x i64] undef, i64 42, i64 12, i64* null) ret void diff --git a/test/CodeGen/AArch64/GlobalISel/debug-insts.ll b/test/CodeGen/AArch64/GlobalISel/debug-insts.ll new file mode 100644 index 000000000000..5a76661180f2 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/debug-insts.ll @@ -0,0 +1,68 @@ +; RUN: llc -global-isel -mtriple=aarch64 %s -stop-after=irtranslator -o - | FileCheck %s +; RUN: llc -mtriple=aarch64 -global-isel --global-isel-abort=0 -o /dev/null + +; CHECK-LABEL: name: debug_declare +; CHECK: stack: +; CHECK: - { id: {{.*}}, name: in.addr, offset: {{.*}}, size: {{.*}}, alignment: {{.*}}, di-variable: '!11', +; CHECK-NEXT: di-expression: '!12', di-location: '!13' } +; CHECK: DBG_VALUE debug-use %0(s32), debug-use _, !11, !12, debug-location !13 +define void @debug_declare(i32 %in) #0 !dbg !7 { +entry: + %in.addr = alloca i32, align 4 + store i32 %in, i32* %in.addr, align 4 + call void @llvm.dbg.declare(metadata i32* %in.addr, metadata !11, metadata !12), !dbg !13 + call void @llvm.dbg.declare(metadata i32 %in, metadata !11, metadata !12), !dbg !13 + ret void, !dbg !14 +} + +; CHECK-LABEL: name: debug_declare_vla +; CHECK: DBG_VALUE debug-use %{{[0-9]+}}(p0), debug-use _, !11, !12, debug-location !13 +define void @debug_declare_vla(i32 %in) #0 !dbg !7 { +entry: + %vla.addr = alloca i32, i32 %in + call void @llvm.dbg.declare(metadata i32* %vla.addr, metadata !11, metadata !12), !dbg !13 + ret void, !dbg !14 +} + +; CHECK-LABEL: name: debug_value +; CHECK: [[IN:%[0-9]+]](s32) = COPY %w0 +define void @debug_value(i32 %in) #0 !dbg !7 { + %addr = alloca i32 +; CHECK: DBG_VALUE debug-use [[IN]](s32), debug-use _, !11, !12, debug-location !13 + call void @llvm.dbg.value(metadata i32 %in, i64 0, metadata !11, metadata !12), !dbg !13 + store i32 %in, i32* %addr +; CHECK: DBG_VALUE debug-use %1(p0), debug-use _, !11, !15, debug-location !13 + call void @llvm.dbg.value(metadata i32* %addr, i64 0, 
metadata !11, metadata !15), !dbg !13 +; CHECK: DBG_VALUE 123, 0, !11, !12, debug-location !13 + call void @llvm.dbg.value(metadata i32 123, i64 0, metadata !11, metadata !12), !dbg !13 +; CHECK: DBG_VALUE float 1.000000e+00, 0, !11, !12, debug-location !13 + call void @llvm.dbg.value(metadata float 1.000000e+00, i64 0, metadata !11, metadata !12), !dbg !13 +; CHECK: DBG_VALUE _, 0, !11, !12, debug-location !13 + call void @llvm.dbg.value(metadata i32* null, i64 0, metadata !11, metadata !12), !dbg !13 + ret void +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 289075) (llvm/trunk 289080)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "tmp.c", directory: "/Users/tim/llvm/build") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"PIC Level", i32 2} +!6 = !{!"clang version 4.0.0 (trunk 289075) (llvm/trunk 289080)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DILocalVariable(name: "in", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!12 = !DIExpression() +!13 = !DILocation(line: 1, column: 14, scope: !7) +!14 = !DILocation(line: 2, column: 1, scope: !7) +!15 = !DIExpression(DW_OP_deref) diff --git a/test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll b/test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll new file mode 100644 index 000000000000..196910e96ce3 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll @@ -0,0 +1,48 @@ +; RUN: llc -mtriple=aarch64 -global-isel %s -o - -stop-after=irtranslator | FileCheck %s + +; CHECK-LABEL: name: test_simple_alloca +; CHECK: [[NUMELTS:%[0-9]+]](s32) = COPY %w0 +; CHECK: [[TYPE_SIZE:%[0-9]+]](s64) = G_CONSTANT i64 -1 +; CHECK: [[NUMELTS_64:%[0-9]+]](s64) = G_ZEXT [[NUMELTS]](s32) +; CHECK: [[NUMBYTES:%[0-9]+]](s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]] +; CHECK: [[SP_TMP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[ALLOC:%[0-9]+]](p0) = G_GEP [[SP_TMP]], [[NUMBYTES]] +; CHECK: [[ALIGNED_ALLOC:%[0-9]+]](p0) = G_PTR_MASK [[ALLOC]], 4 +; CHECK: %sp = COPY [[ALIGNED_ALLOC]] +; CHECK: [[ALLOC:%[0-9]+]](p0) = COPY [[ALIGNED_ALLOC]] +; CHECK: %x0 = COPY [[ALLOC]] +define i8* @test_simple_alloca(i32 %numelts) { + %addr = alloca i8, i32 %numelts + ret i8* %addr +} + +; CHECK-LABEL: name: test_aligned_alloca +; CHECK: [[NUMELTS:%[0-9]+]](s32) = COPY %w0 +; CHECK: [[TYPE_SIZE:%[0-9]+]](s64) = G_CONSTANT i64 -1 +; CHECK: [[NUMELTS_64:%[0-9]+]](s64) = G_ZEXT [[NUMELTS]](s32) +; CHECK: [[NUMBYTES:%[0-9]+]](s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]] +; CHECK: [[SP_TMP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[ALLOC:%[0-9]+]](p0) = G_GEP [[SP_TMP]], [[NUMBYTES]] +; CHECK: [[ALIGNED_ALLOC:%[0-9]+]](p0) = G_PTR_MASK [[ALLOC]], 5 +; CHECK: %sp = COPY [[ALIGNED_ALLOC]] +; CHECK: [[ALLOC:%[0-9]+]](p0) = COPY [[ALIGNED_ALLOC]] +; CHECK: %x0 = COPY [[ALLOC]] +define i8* @test_aligned_alloca(i32 %numelts) { + %addr = alloca i8, i32 %numelts, align 32 + ret i8* %addr +} + +; 
CHECK-LABEL: name: test_natural_alloca +; CHECK: [[NUMELTS:%[0-9]+]](s32) = COPY %w0 +; CHECK: [[TYPE_SIZE:%[0-9]+]](s64) = G_CONSTANT i64 -16 +; CHECK: [[NUMELTS_64:%[0-9]+]](s64) = G_ZEXT [[NUMELTS]](s32) +; CHECK: [[NUMBYTES:%[0-9]+]](s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]] +; CHECK: [[SP_TMP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[ALLOC:%[0-9]+]](p0) = G_GEP [[SP_TMP]], [[NUMBYTES]] +; CHECK: %sp = COPY [[ALLOC]] +; CHECK: [[ALLOC_TMP:%[0-9]+]](p0) = COPY [[ALLOC]] +; CHECK: %x0 = COPY [[ALLOC_TMP]] +define i128* @test_natural_alloca(i32 %numelts) { + %addr = alloca i128, i32 %numelts + ret i128* %addr +} diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-abort.ll b/test/CodeGen/AArch64/GlobalISel/gisel-abort.ll index 76eafdd5af5e..a1480c46fe40 100644 --- a/test/CodeGen/AArch64/GlobalISel/gisel-abort.ll +++ b/test/CodeGen/AArch64/GlobalISel/gisel-abort.ll @@ -1,4 +1,4 @@ -; RUN: llc -march aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s +; RUN: llc -mtriple=aarch64-unknown-unknown -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s ; CHECK-NOT: fallback ; CHECK: empty diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll new file mode 100644 index 000000000000..3ecdb7bbedfb --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll @@ -0,0 +1,48 @@ +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: -O0 -aarch64-enable-global-isel-at-O=0 \ +; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix NOFALLBACK + +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: -O0 -aarch64-enable-global-isel-at-O=0 -global-isel-abort=2 \ +; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix FALLBACK + +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: -global-isel \ +; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix NOFALLBACK + +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: -global-isel -global-isel-abort=2 \ +; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix FALLBACK + +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: -O1 -aarch64-enable-global-isel-at-O=3 \ +; RUN: | FileCheck %s --check-prefix ENABLED + +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: -O1 -aarch64-enable-global-isel-at-O=0 \ +; RUN: | FileCheck %s --check-prefix DISABLED + +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: -aarch64-enable-global-isel-at-O=-1 \ +; RUN: | FileCheck %s --check-prefix DISABLED + +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: | FileCheck %s --check-prefix DISABLED + +; ENABLED: IRTranslator +; ENABLED-NEXT: Legalizer +; ENABLED-NEXT: RegBankSelect +; ENABLED-NEXT: InstructionSelect +; ENABLED-NEXT: ResetMachineFunction + +; FALLBACK: AArch64 Instruction Selection +; NOFALLBACK-NOT: AArch64 Instruction Selection + +; DISABLED-NOT: IRTranslator + +; DISABLED: AArch64 Instruction Selection +; DISABLED: Expand ISel Pseudo-instructions + +define void @empty() { + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-fail-intermediate-legalizer.ll b/test/CodeGen/AArch64/GlobalISel/gisel-fail-intermediate-legalizer.ll new file mode 100644 index 000000000000..e333f742e04d --- /dev/null +++ 
b/test/CodeGen/AArch64/GlobalISel/gisel-fail-intermediate-legalizer.ll @@ -0,0 +1,8 @@ +;RUN: llc -mtriple=aarch64-unknown-unknown -o - -global-isel -global-isel-abort=2 %s 2>&1 | FileCheck %s +; CHECK: fallback +; CHECK-LABEL: foo +define i16 @foo(half* %p) { + %tmp0 = load half, half* %p + %tmp1 = fptoui half %tmp0 to i16 + ret i16 %tmp1 +} diff --git a/test/CodeGen/AArch64/GlobalISel/inline-asm.ll b/test/CodeGen/AArch64/GlobalISel/inline-asm.ll new file mode 100644 index 000000000000..8ff7c4495dcc --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/inline-asm.ll @@ -0,0 +1,10 @@ +; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - | FileCheck %s + +; CHECK-LABEL: test_asm: +; CHECK: {{APP|InlineAsm Start}} +; CHECK: mov x0, {{x[0-9]+}} +; CHECK: {{NO_APP|InlineAsm End}} +define void @test_asm() { + call void asm sideeffect "mov x0, $0", "r"(i64 42) + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-bitcast.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-bitcast.ll new file mode 100644 index 000000000000..8d1b02216ea7 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-bitcast.ll @@ -0,0 +1,30 @@ +; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -stop-after=irtranslator %s -o - | FileCheck %s + +; Check that we don't invalidate the vreg map. +; This test is brittle: the invalidation only triggers when we grow the map. + +; CHECK-LABEL: name: test_bitcast_invalid_vreg +define i32 @test_bitcast_invalid_vreg() { + %tmp0 = add i32 1, 2 + %tmp1 = add i32 3, 4 + %tmp2 = add i32 5, 6 + %tmp3 = add i32 7, 8 + %tmp4 = add i32 9, 10 + %tmp5 = add i32 11, 12 + %tmp6 = add i32 13, 14 + %tmp7 = add i32 15, 16 + %tmp8 = add i32 17, 18 + %tmp9 = add i32 19, 20 + %tmp10 = add i32 21, 22 + %tmp11 = add i32 23, 24 + %tmp12 = add i32 25, 26 + %tmp13 = add i32 27, 28 + %tmp14 = add i32 29, 30 + %tmp15 = add i32 30, 30 + +; At this point we mapped 46 values. The 'i32 100' constant will grow the map. +; CHECK: %46(s32) = G_CONSTANT i32 100 +; CHECK: %w0 = COPY %46(s32) + %res = bitcast i32 100 to i32 + ret i32 %res +} diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll index 718364af2aca..ef4445111d7b 100644 --- a/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll +++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll @@ -6,7 +6,7 @@ declare i32 @foo(i32) declare i32 @__gxx_personality_v0(...) 
declare i32 @llvm.eh.typeid.for(i8*) -; CHECK: name: bar +; CHECK-LABEL: name: bar ; CHECK: body: ; CHECK-NEXT: bb.1 (%ir-block.0): ; CHECK: successors: %[[GOOD:bb.[0-9]+.continue]]{{.*}}%[[BAD:bb.[0-9]+.broken]] @@ -15,19 +15,24 @@ declare i32 @llvm.eh.typeid.for(i8*) ; CHECK: BL @foo, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %w0, implicit-def %w0 ; CHECK: {{%[0-9]+}}(s32) = COPY %w0 ; CHECK: EH_LABEL +; CHECK: G_BR %[[GOOD]] ; CHECK: [[BAD]] (landing-pad): ; CHECK: EH_LABEL +; CHECK: [[UNDEF:%[0-9]+]](s128) = IMPLICIT_DEF ; CHECK: [[PTR:%[0-9]+]](p0) = COPY %x0 -; CHECK: [[SEL:%[0-9]+]](p0) = COPY %x1 -; CHECK: [[PTR_SEL:%[0-9]+]](s128) = G_SEQUENCE [[PTR]](p0), 0, [[SEL]](p0), 64 -; CHECK: [[PTR_RET:%[0-9]+]](s64), [[SEL_RET:%[0-9]+]](s32) = G_EXTRACT [[PTR_SEL]](s128), 0, 64 +; CHECK: [[VAL_WITH_PTR:%[0-9]+]](s128) = G_INSERT [[UNDEF]], [[PTR]](p0), 0 +; CHECK: [[SEL_PTR:%[0-9]+]](p0) = COPY %x1 +; CHECK: [[SEL:%[0-9]+]](s32) = G_PTRTOINT [[SEL_PTR]] +; CHECK: [[PTR_SEL:%[0-9]+]](s128) = G_INSERT [[VAL_WITH_PTR]], [[SEL]](s32), 64 +; CHECK: [[PTR_RET:%[0-9]+]](s64) = G_EXTRACT [[PTR_SEL]](s128), 0 +; CHECK: [[SEL_RET:%[0-9]+]](s32) = G_EXTRACT [[PTR_SEL]](s128), 64 ; CHECK: %x0 = COPY [[PTR_RET]] ; CHECK: %w1 = COPY [[SEL_RET]] ; CHECK: [[GOOD]]: ; CHECK: [[SEL:%[0-9]+]](s32) = G_CONSTANT i32 1 -; CHECK: {{%[0-9]+}}(s128) = G_INSERT {{%[0-9]+}}(s128), [[SEL]](s32), 64 +; CHECK: {{%[0-9]+}}(s128) = G_INSERT {{%[0-9]+}}, [[SEL]](s32), 64 define { i8*, i32 } @bar() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { %res32 = invoke i32 @foo(i32 42) to label %continue unwind label %broken @@ -42,3 +47,48 @@ continue: %res.good = insertvalue { i8*, i32 } undef, i32 %sel.int, 1 ret { i8*, i32 } %res.good } + +; CHECK-LABEL: name: test_invoke_indirect +; CHECK: [[CALLEE:%[0-9]+]](p0) = COPY %x0 +; CHECK: BLR [[CALLEE]] +define void @test_invoke_indirect(void()* %callee) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + invoke void %callee() to label %continue unwind label %broken + +broken: + landingpad { i8*, i32 } catch i8* bitcast(i8** @_ZTIi to i8*) + ret void + +continue: + ret void +} + +; CHECK-LABEL: name: test_invoke_varargs + +; CHECK: [[NULL:%[0-9]+]](p0) = G_CONSTANT i64 0 +; CHECK: [[ANSWER:%[0-9]+]](s32) = G_CONSTANT i32 42 +; CHECK: [[ONE:%[0-9]+]](s32) = G_FCONSTANT float 1.0 + +; CHECK: %x0 = COPY [[NULL]] + +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFFSET:%[0-9]+]](s64) = G_CONSTANT i64 0 +; CHECK: [[SLOT:%[0-9]+]](p0) = G_GEP [[SP]], [[OFFSET]](s64) +; CHECK: G_STORE [[ANSWER]](s32), [[SLOT]] + +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFFSET:%[0-9]+]](s64) = G_CONSTANT i64 8 +; CHECK: [[SLOT:%[0-9]+]](p0) = G_GEP [[SP]], [[OFFSET]](s64) +; CHECK: G_STORE [[ONE]](s32), [[SLOT]] + +; CHECK: BL @printf +declare void @printf(i8*, ...) +define void @test_invoke_varargs() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + invoke void(i8*, ...) 
@printf(i8* null, i32 42, float 1.0) to label %continue unwind label %broken + +broken: + landingpad { i8*, i32 } catch i8* bitcast(i8** @_ZTIi to i8*) + ret void + +continue: + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir index 252e60c6b2ec..9b27198b961a 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -33,14 +33,14 @@ body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 ; CHECK-LABEL: name: test_scalar_add_big - ; CHECK-NOT: G_EXTRACT - ; CHECK-NOT: G_SEQUENCE + ; CHECK-NOT: G_MERGE_VALUES + ; CHECK-NOT: G_UNMERGE_VALUES ; CHECK-DAG: [[CARRY0_32:%.*]](s32) = G_CONSTANT i32 0 ; CHECK-DAG: [[CARRY0:%[0-9]+]](s1) = G_TRUNC [[CARRY0_32]] ; CHECK: [[RES_LO:%.*]](s64), [[CARRY:%.*]](s1) = G_UADDE %0, %2, [[CARRY0]] ; CHECK: [[RES_HI:%.*]](s64), {{%.*}}(s1) = G_UADDE %1, %3, [[CARRY]] - ; CHECK-NOT: G_EXTRACT - ; CHECK-NOT: G_SEQUENCE + ; CHECK-NOT: G_MERGE_VALUES + ; CHECK-NOT: G_UNMERGE_VALUES ; CHECK: %x0 = COPY [[RES_LO]] ; CHECK: %x1 = COPY [[RES_HI]] @@ -48,10 +48,10 @@ body: | %1(s64) = COPY %x1 %2(s64) = COPY %x2 %3(s64) = COPY %x3 - %4(s128) = G_SEQUENCE %0, 0, %1, 64 - %5(s128) = G_SEQUENCE %2, 0, %3, 64 + %4(s128) = G_MERGE_VALUES %0, %1 + %5(s128) = G_MERGE_VALUES %2, %3 %6(s128) = G_ADD %4, %5 - %7(s64), %8(s64) = G_EXTRACT %6, 0, 64 + %7(s64), %8(s64) = G_UNMERGE_VALUES %6 %x0 = COPY %7 %x1 = COPY %8 ... @@ -69,7 +69,10 @@ body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 ; CHECK-LABEL: name: test_scalar_add_small - ; CHECK: [[RES:%.*]](s8) = G_ADD %2, %3 + ; CHECK: [[OP0:%.*]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[OP1:%.*]](s32) = G_ANYEXT %3(s8) + ; CHECK: [[RES32:%.*]](s32) = G_ADD [[OP0]], [[OP1]] + ; CHECK: [[RES:%.*]](s8) = G_TRUNC [[RES32]](s32) %0(s64) = COPY %x0 %1(s64) = COPY %x1 @@ -109,10 +112,10 @@ body: | %1(<2 x s64>) = COPY %q1 %2(<2 x s64>) = COPY %q2 %3(<2 x s64>) = COPY %q3 - %4(<4 x s64>) = G_SEQUENCE %0, 0, %1, 128 - %5(<4 x s64>) = G_SEQUENCE %2, 0, %3, 128 + %4(<4 x s64>) = G_MERGE_VALUES %0, %1 + %5(<4 x s64>) = G_MERGE_VALUES %2, %3 %6(<4 x s64>) = G_ADD %4, %5 - %7(<2 x s64>), %8(<2 x s64>) = G_EXTRACT %6, 0, 128 + %7(<2 x s64>), %8(<2 x s64>) = G_UNMERGE_VALUES %6 %q0 = COPY %7 %q1 = COPY %8 ... 
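For reference, the legalizations checked above correspond to ordinary IR adds at the same widths. A minimal illustrative sketch in LLVM IR (the function names are hypothetical; this is not a test in this import):

; A 128-bit add is narrowed by the legalizer into a G_UADDE carry chain.
define i128 @illustrate_add_big(i128 %lhs, i128 %rhs) {
  %res = add i128 %lhs, %rhs
  ret i128 %res
}

; An 8-bit add is widened to 32 bits via G_ANYEXT and truncated back with G_TRUNC.
define i8 @illustrate_add_small(i8 %lhs, i8 %rhs) {
  %res = add i8 %lhs, %rhs
  ret i8 %res
}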
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-and.mir b/test/CodeGen/AArch64/GlobalISel/legalize-and.mir index 69459bfacb0a..75e1d5163532 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-and.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-and.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -22,7 +22,10 @@ body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 ; CHECK-LABEL: name: test_scalar_and_small - ; CHECK: %4(s8) = G_AND %2, %3 + ; CHECK: [[OP0:%.*]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[OP1:%.*]](s32) = G_ANYEXT %3(s8) + ; CHECK: [[RES32:%.*]](s32) = G_AND [[OP0]], [[OP1]] + ; CHECK: [[RES:%.*]](s8) = G_TRUNC [[RES32]](s32) %0(s64) = COPY %x0 %1(s64) = COPY %x1 diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir index 926a62761ce0..29f83b362895 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-combines.mir b/test/CodeGen/AArch64/GlobalISel/legalize-combines.mir index cc1dc80488ba..fab6dcf43346 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-combines.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-combines.mir @@ -1,92 +1,132 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "aarch64--" - define void @test_combines() { - entry: - ret void - } + define void @test_combines_1() { ret void } + define void @test_combines_2() { ret void } + define void @test_combines_3() { ret void } + define void @test_combines_4() { ret void } + define void @test_combines_5() { ret void } + define void @test_combines_6() { ret void } ... --- -name: test_combines -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } - - { id: 6, class: _ } - - { id: 7, class: _ } - - { id: 8, class: _ } - - { id: 9, class: _ } - - { id: 10, class: _ } - - { id: 11, class: _ } - - { id: 12, class: _ } - - { id: 13, class: _ } - - { id: 14, class: _ } - - { id: 15, class: _ } - - { id: 16, class: _ } - - { id: 17, class: _ } - - { id: 18, class: _ } - - { id: 19, class: _ } - - { id: 20, class: _ } - - { id: 21, class: _ } - - { id: 22, class: _ } - - { id: 23, class: _ } - - { id: 24, class: _ } +name: test_combines_1 body: | - bb.0.entry: - liveins: %w0, %w1, %x2, %x3 + bb.0: + liveins: %w0 - %0(s32) = COPY %w0 - %1(s32) = COPY %w1 - %2(s8) = G_TRUNC %0 + %0:_(s32) = COPY %w0 + %1:_(s8) = G_TRUNC %0 ; Only one of these extracts can be eliminated, the offsets don't match ; properly in the other cases. 
- ; CHECK-LABEL: name: test_combines - ; CHECK: %3(s32) = G_SEQUENCE %2(s8), 1 - ; CHECK: %4(s8) = G_EXTRACT %3(s32), 0 + ; CHECK-LABEL: name: test_combines_1 + ; CHECK: %2(s32) = G_SEQUENCE %1(s8), 1 + ; CHECK: %3(s8) = G_EXTRACT %2(s32), 0 ; CHECK-NOT: G_EXTRACT - ; CHECK: %6(s8) = G_EXTRACT %3(s32), 2 - ; CHECK: %7(s32) = G_ZEXT %2(s8) - %3(s32) = G_SEQUENCE %2, 1 - %4(s8) = G_EXTRACT %3, 0 - %5(s8) = G_EXTRACT %3, 1 - %6(s8) = G_EXTRACT %3, 2 - %7(s32) = G_ZEXT %5 + ; CHECK: %5(s8) = G_EXTRACT %2(s32), 2 + ; CHECK: %6(s32) = G_ZEXT %1(s8) + + %2:_(s32) = G_SEQUENCE %1, 1 + %3:_(s8) = G_EXTRACT %2, 0 + %4:_(s8) = G_EXTRACT %2, 1 + %5:_(s8) = G_EXTRACT %2, 2 + %6:_(s32) = G_ZEXT %4 +... + +--- +name: test_combines_2 +body: | + bb.0: + liveins: %w0 + + %0:_(s32) = COPY %w0 ; Similarly, here the types don't match. - ; CHECK: %10(s32) = G_SEQUENCE %8(s16), 0, %9(s16), 16 - ; CHECK: %11(s1) = G_EXTRACT %10(s32), 0 - ; CHECK: %12(s32) = G_EXTRACT %10(s32), 0 - %8(s16) = G_TRUNC %0 - %9(s16) = G_ADD %8, %8 - %10(s32) = G_SEQUENCE %8, 0, %9, 16 - %11(s1) = G_EXTRACT %10, 0 - %12(s32) = G_EXTRACT %10, 0 + ; CHECK-LABEL: name: test_combines_2 + ; CHECK: %2(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32 + ; CHECK: %3(s1) = G_EXTRACT %2(s64), 0 + ; CHECK: %4(s64) = G_EXTRACT %2(s64), 0 + %1:_(s32) = G_ADD %0, %0 + %2:_(s64) = G_SEQUENCE %0, 0, %1, 32 + %3:_(s1) = G_EXTRACT %2, 0 + %4:_(s64) = G_EXTRACT %2, 0 +... + +--- +name: test_combines_3 +body: | + bb.0: + liveins: %w0 + + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_combines_3 + ; CHECK: %1(s32) = G_ADD %0, %0 + ; CHECK-NOT: G_SEQUENCE + ; CHECK-NOT: G_EXTRACT + ; CHECK: %5(s32) = G_ADD %0, %1 + %1:_(s32) = G_ADD %0, %0 + %2:_(s64) = G_SEQUENCE %0, 0, %1, 32 + %3:_(s32) = G_EXTRACT %2, 0 + %4:_(s32) = G_EXTRACT %2, 32 + %5:_(s32) = G_ADD %3, %4 +... +--- +name: test_combines_4 +body: | + bb.0: + liveins: %x0 + + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_combines_4 + ; CHECK: %2(<2 x s32>) = G_EXTRACT %1(s128), 0 + ; CHECK: %3(<2 x s32>) = G_ADD %2, %2 + %1:_(s128) = G_SEQUENCE %0, 0, %0, 64 + %2:_(<2 x s32>) = G_EXTRACT %1, 0 + %3:_(<2 x s32>) = G_ADD %2, %2 +... + +--- +name: test_combines_5 +body: | + bb.0: + liveins: %w0 + + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_combines_5 + ; CHECK-NOT: G_SEQUENCE ; CHECK-NOT: G_EXTRACT - ; CHECK: %15(s16) = G_ADD %8, %9 - %13(s16), %14(s16) = G_EXTRACT %10, 0, 16 - %15(s16) = G_ADD %13, %14 + ; CHECK: %5(s32) = G_ADD %0, %1 + %1:_(s32) = G_ADD %0, %0 + %2:_(s64) = G_SEQUENCE %0, 0, %1, 32 + %3:_(s32) = G_EXTRACT %2, 0 + %4:_(s32) = G_EXTRACT %2, 32 + %5:_(s32) = G_ADD %3, %4 +... + +--- +name: test_combines_6 +body: | + bb.0: + liveins: %w0 - ; CHECK: %18(<2 x s32>) = G_EXTRACT %17(s128), 0 - ; CHECK: %19(<2 x s32>) = G_ADD %18, %18 - %16(s64) = COPY %x0 - %17(s128) = G_SEQUENCE %16, 0, %16, 64 - %18(<2 x s32>) = G_EXTRACT %17, 0 - %19(<2 x s32>) = G_ADD %18, %18 + ; CHECK-LABEL: name: test_combines_6 + ; CHECK: %0(s32) = COPY %w0 + %0:_(s32) = COPY %w0 + ; Check that we replace all the uses of a G_EXTRACT. ; CHECK-NOT: G_SEQUENCE ; CHECK-NOT: G_EXTRACT - ; CHECK: %24(s32) = G_ADD %0, %20 - %20(s32) = G_ADD %0, %0 - %21(s64) = G_SEQUENCE %0, 0, %20, 32 - %22(s32) = G_EXTRACT %21, 0 - %23(s32) = G_EXTRACT %21, 32 - %24(s32) = G_ADD %22, %23 + ; CHECK: %3(s32) = G_MUL %0, %0 + ; CHECK: %4(s32) = G_ADD %0, %3 + %1:_(s32) = G_SEQUENCE %0, 0 + %2:_(s32) = G_EXTRACT %1, 0 + %3:_(s32) = G_MUL %2, %2 + %4:_(s32) = G_ADD %2, %3 ... 
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir b/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir index 56a7d4736ae8..16d9e59698fe 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -57,7 +57,7 @@ body: | ; CHECK: %0(s32) = G_FCONSTANT float 1.000000e+00 ; CHECK: %1(s64) = G_FCONSTANT double 2.000000e+00 ; CHECK: [[TMP:%[0-9]+]](s32) = G_FCONSTANT half 0xH0000 - ; CHECK; %2(s16) = G_FPTRUNC [[TMP]] + ; CHECK: %2(s16) = G_FPTRUNC [[TMP]] %0(s32) = G_FCONSTANT float 1.0 %1(s64) = G_FCONSTANT double 2.0 diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-div.mir b/test/CodeGen/AArch64/GlobalISel/legalize-div.mir index aaef45d3c928..c6e0aabfd2c0 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-div.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-div.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-exceptions.ll b/test/CodeGen/AArch64/GlobalISel/legalize-exceptions.ll new file mode 100644 index 000000000000..23e7d5163e5a --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-exceptions.ll @@ -0,0 +1,53 @@ +; RUN: llc -O0 -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel -stop-after=legalizer %s -o - | FileCheck %s + +@_ZTIi = external global i8* + +declare i32 @foo(i32) +declare i32 @__gxx_personality_v0(...) 
+declare i32 @llvm.eh.typeid.for(i8*) +declare void @_Unwind_Resume(i8*) + +; CHECK: name: bar +; CHECK: body: +; CHECK-NEXT: bb.1 (%ir-block.0): +; CHECK: successors: %{{bb.[0-9]+.continue.*}}%[[LP:bb.[0-9]+.cleanup]] + +; CHECK: [[LP]] (landing-pad): +; CHECK: EH_LABEL + +; CHECK: [[PTR:%[0-9]+]](p0) = COPY %x0 +; CHECK: [[STRUCT_PTR:%[0-9]+]](s64) = G_PTRTOINT [[PTR]](p0) + +; CHECK: [[SEL_PTR:%[0-9]+]](p0) = COPY %x1 +; CHECK: [[SEL:%[0-9]+]](s32) = G_PTRTOINT [[SEL_PTR]] +; CHECK: [[STRUCT_SEL:%[0-9]+]](s64) = G_INSERT {{%[0-9]+}}, [[SEL]](s32), 0 + +; CHECK: [[STRUCT:%[0-9]+]](s128) = G_MERGE_VALUES [[STRUCT_PTR]](s64), [[STRUCT_SEL]] + +; CHECK: [[PTR:%[0-9]+]](p0) = G_EXTRACT [[STRUCT]](s128), 0 +; CHECK: G_STORE [[PTR]](p0), {{%[0-9]+}}(p0) + +; CHECK: [[SEL:%[0-9]+]](s32) = G_EXTRACT [[STRUCT]](s128), 64 +; CHECK: G_STORE [[SEL]](s32), {{%[0-9]+}}(p0) + +define void @bar() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %1 = invoke i32 @foo(i32 42) to label %continue unwind label %cleanup + +cleanup: + %2 = landingpad { i8*, i32 } cleanup + %3 = extractvalue { i8*, i32 } %2, 0 + store i8* %3, i8** %exn.slot, align 8 + %4 = extractvalue { i8*, i32 } %2, 1 + store i32 %4, i32* %ehselector.slot, align 4 + br label %eh.resume + +continue: + ret void + +eh.resume: + %exn = load i8*, i8** %exn.slot, align 8 + call void @_Unwind_Resume(i8* %exn) + unreachable +} diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir index 9907f009d931..70b55e4ebc66 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir index 72bd613fab3a..8cdc7b78b1e9 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-fneg.mir b/test/CodeGen/AArch64/GlobalISel/legalize-fneg.mir new file mode 100644 index 000000000000..8b5cbdfa55e3 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-fneg.mir @@ -0,0 +1,48 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + define void @test_fneg_f32() { + entry: + ret void + } + define void @test_fneg_f64() { + entry: + ret void + } +... +--- +name: test_fneg_f32 +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1: + liveins: %s0 + ; CHECK-LABEL: name: test_fneg_f32 + ; CHECK: [[VAR:%[0-9]+]](s32) = COPY %s0 + ; CHECK: [[ZERO:%[0-9]+]](s32) = G_FCONSTANT float -0.000000e+00 + ; CHECK: [[RES:%[0-9]+]](s32) = G_FSUB [[ZERO]], [[VAR]] + ; CHECK: %s0 = COPY [[RES]](s32) + %0(s32) = COPY %s0 + %1(s32) = G_FNEG %0 + %s0 = COPY %1(s32) +... 
+--- +name: test_fneg_f64 +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1: + liveins: %d0 + ; CHECK-LABEL: name: test_fneg_f64 + ; CHECK: [[VAR:%[0-9]+]](s64) = COPY %d0 + ; CHECK: [[ZERO:%[0-9]+]](s64) = G_FCONSTANT double -0.000000e+00 + ; CHECK: [[RES:%[0-9]+]](s64) = G_FSUB [[ZERO]], [[VAR]] + ; CHECK: %d0 = COPY [[RES]](s64) + %0(s64) = COPY %d0 + %1(s64) = G_FNEG %0 + %d0 = COPY %1(s64) +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir b/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir new file mode 100644 index 000000000000..f79d0382ea7c --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir @@ -0,0 +1,201 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + + define void @test_fptosi_s32_s32() { ret void } + define void @test_fptoui_s32_s32() { ret void } + define void @test_fptosi_s32_s64() { ret void } + define void @test_fptoui_s32_s64() { ret void } + + define void @test_fptosi_s64_s32() { ret void } + define void @test_fptoui_s64_s32() { ret void } + define void @test_fptosi_s64_s64() { ret void } + define void @test_fptoui_s64_s64() { ret void } + + define void @test_fptosi_s1_s32() { ret void } + define void @test_fptoui_s1_s32() { ret void } + + define void @test_fptosi_s8_s64() { ret void } + define void @test_fptoui_s8_s64() { ret void } + + define void @test_fptosi_s16_s32() { ret void } + define void @test_fptoui_s16_s32() { ret void } +... + +--- +name: test_fptosi_s32_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_fptosi_s32_s32 + ; CHECK: %1(s32) = G_FPTOSI %0 + %1:_(s32) = G_FPTOSI %0 +... + +--- +name: test_fptoui_s32_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_fptoui_s32_s32 + ; CHECK: %1(s32) = G_FPTOUI %0 + %1:_(s32) = G_FPTOUI %0 +... + +--- +name: test_fptosi_s32_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_fptosi_s32_s64 + ; CHECK: %1(s32) = G_FPTOSI %0 + %1:_(s32) = G_FPTOSI %0 +... + +--- +name: test_fptoui_s32_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_fptoui_s32_s64 + ; CHECK: %1(s32) = G_FPTOUI %0 + %1:_(s32) = G_FPTOUI %0 +... + +--- +name: test_fptosi_s64_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_fptosi_s64_s32 + ; CHECK: %1(s64) = G_FPTOSI %0 + %1:_(s64) = G_FPTOSI %0 +... + +--- +name: test_fptoui_s64_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_fptoui_s64_s32 + ; CHECK: %1(s64) = G_FPTOUI %0 + %1:_(s64) = G_FPTOUI %0 +... + +--- +name: test_fptosi_s64_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_fptosi_s64_s64 + ; CHECK: %1(s64) = G_FPTOSI %0 + %1:_(s64) = G_FPTOSI %0 +... + +--- +name: test_fptoui_s64_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_fptoui_s64_s64 + ; CHECK: %1(s64) = G_FPTOUI %0 + %1:_(s64) = G_FPTOUI %0 +... + + + +--- +name: test_fptosi_s1_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_fptosi_s1_s32 + ; CHECK: %2(s32) = G_FPTOSI %0 + ; CHECK: %1(s1) = G_TRUNC %2 + %1:_(s1) = G_FPTOSI %0 +... 
+ +--- +name: test_fptoui_s1_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_fptoui_s1_s32 + ; CHECK: %2(s32) = G_FPTOUI %0 + ; CHECK: %1(s1) = G_TRUNC %2 + %1:_(s1) = G_FPTOUI %0 +... + +--- +name: test_fptosi_s8_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_fptosi_s8_s64 + ; CHECK: %2(s32) = G_FPTOSI %0 + ; CHECK: %1(s8) = G_TRUNC %2 + %1:_(s8) = G_FPTOSI %0 +... + +--- +name: test_fptoui_s8_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_fptoui_s8_s64 + ; CHECK: %2(s32) = G_FPTOUI %0 + ; CHECK: %1(s8) = G_TRUNC %2 + %1:_(s8) = G_FPTOUI %0 +... + +--- +name: test_fptosi_s16_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_fptosi_s16_s32 + ; CHECK: %2(s32) = G_FPTOSI %0 + ; CHECK: %1(s16) = G_TRUNC %2 + %1:_(s16) = G_FPTOSI %0 +... + +--- +name: test_fptoui_s16_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_fptoui_s16_s32 + ; CHECK: %2(s32) = G_FPTOUI %0 + ; CHECK: %1(s16) = G_TRUNC %2 + %1:_(s16) = G_FPTOUI %0 +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir b/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir index 3f11c123ba51..d6ec983c2067 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir new file mode 100644 index 000000000000..917f181099ec --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir @@ -0,0 +1,141 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + define void @test_inserts_1() { ret void } + define void @test_inserts_2() { ret void } + define void @test_inserts_3() { ret void } + define void @test_inserts_4() { ret void } + define void @test_inserts_5() { ret void } + define void @test_inserts_6() { ret void } +... + +--- +name: test_inserts_1 +body: | + bb.0: + liveins: %w0 + + ; Low part of insertion wipes out the old register entirely, so %0 gets + ; forwarded to the G_STORE. Hi part is unchanged so (split) G_LOAD gets + ; forwarded. + ; CHECK-LABEL: name: test_inserts_1 + ; CHECK: [[LO:%[0-9]+]](s64) = G_LOAD + ; CHECK: [[HI:%[0-9]+]](s64) = G_LOAD + ; CHECK: G_STORE %0(s64) + ; CHECK: G_STORE [[HI]] + %0:_(s64) = COPY %x0 + %1:_(s32) = COPY %w1 + %2:_(p0) = COPY %x2 + %3:_(s128) = G_LOAD %2(p0) :: (load 16) + %4:_(s128) = G_INSERT %3(s128), %0(s64), 0 + G_STORE %4(s128), %2(p0) :: (store 16) + RET_ReallyLR +... + +--- +name: test_inserts_2 +body: | + bb.0: + liveins: %w0 + + ; Low insertion wipes out the old register entirely, so %0 gets forwarded + ; to the G_STORE again. Second insertion is real. 
+ ; CHECK-LABEL: name: test_inserts_2 + ; CHECK: [[LO:%[0-9]+]](s64) = G_LOAD + ; CHECK: [[HI:%[0-9]+]](s64) = G_LOAD + ; CHECK: [[NEWHI:%[0-9]+]](s64) = G_INSERT [[HI]], %1(s32), 0 + ; CHECK: G_STORE %0(s64) + ; CHECK: G_STORE [[NEWHI]] + %0:_(s64) = COPY %x0 + %1:_(s32) = COPY %w1 + %2:_(p0) = COPY %x2 + %3:_(s128) = G_LOAD %2(p0) :: (load 16) + %4:_(s128) = G_INSERT %3(s128), %0(s64), 0 + %5:_(s128) = G_INSERT %4(s128), %1(s32), 64 + G_STORE %5(s128), %2(p0) :: (store 16) + RET_ReallyLR +... + +--- +name: test_inserts_3 +body: | + bb.0: + liveins: %w0 + + ; I'm not entirely convinced inserting a p0 into an s64 is valid, but it's + ; certainly better than the alternative of directly forwarding the value + ; which would cause a nasty type mismatch. + ; CHECK-LABEL: name: test_inserts_3 + ; CHECK: [[LO:%[0-9]+]](s64) = G_LOAD + ; CHECK: [[HI:%[0-9]+]](s64) = G_LOAD + ; CHECK: [[NEWLO:%[0-9]+]](s64) = G_PTRTOINT %0(p0) + ; CHECK: G_STORE [[NEWLO]](s64) + ; CHECK: G_STORE [[HI]] + %0:_(p0) = COPY %x0 + %1:_(s32) = COPY %w1 + %2:_(p0) = COPY %x2 + %3:_(s128) = G_LOAD %2(p0) :: (load 16) + %4:_(s128) = G_INSERT %3(s128), %0(p0), 0 + G_STORE %4(s128), %2(p0) :: (store 16) + RET_ReallyLR +... + +--- +name: test_inserts_4 +body: | + bb.0: + liveins: %w0 + + ; A narrow insert gets surrounded by a G_ANYEXT/G_TRUNC pair. + ; CHECK-LABEL: name: test_inserts_4 + ; CHECK: [[VALEXT:%[0-9]+]](s32) = G_ANYEXT %1(s8) + ; CHECK: [[VAL:%[0-9]+]](s32) = G_INSERT [[VALEXT]], %0(s1), 0 + ; CHECK: %3(s8) = G_TRUNC [[VAL]](s32) + %0:_(s1) = COPY %w0 + %1:_(s8) = COPY %w1 + %2:_(p0) = COPY %x2 + %3:_(s8) = G_INSERT %1(s8), %0(s1), 0 + G_STORE %3(s8), %2(p0) :: (store 1) + RET_ReallyLR +... + +--- +name: test_inserts_5 +body: | + bb.0: + liveins: %x0, %x1, %x2 + + + ; CHECK-LABEL: name: test_inserts_5 + ; CHECK: [[INS_LO:%[0-9]+]](s32) = G_EXTRACT %2(s64), 0 + ; CHECK: [[VAL_LO:%[0-9]+]](s64) = G_INSERT %0, [[INS_LO]](s32), 32 + ; CHECK: [[INS_HI:%[0-9]+]](s32) = G_EXTRACT %2(s64), 32 + ; CHECK: [[VAL_HI:%[0-9]+]](s64) = G_INSERT %1, [[INS_HI]](s32), 0 + ; CHECK: %4(s128) = G_MERGE_VALUES [[VAL_LO]](s64), [[VAL_HI]](s64) + %0:_(s64) = COPY %x0 + %1:_(s64) = COPY %x1 + %2:_(s64) = COPY %x2 + %3:_(s128) = G_MERGE_VALUES %0, %1 + %4:_(s128) = G_INSERT %3, %2, 32 + RET_ReallyLR +... + +--- +name: test_inserts_6 +body: | + bb.0: + liveins: %x0, %x1, %x2 + + + ; CHECK-LABEL: name: test_inserts_6 + ; CHECK: [[VAL_LO:%[0-9]+]](s64) = G_INSERT %0, %2(s32), 32 + ; CHECK: %4(s128) = G_MERGE_VALUES [[VAL_LO]](s64), %1(s64) + %0:_(s64) = COPY %x0 + %1:_(s64) = COPY %x1 + %2:_(s32) = COPY %w2 + %3:_(s128) = G_MERGE_VALUES %0, %1 + %4:_(s128) = G_INSERT %3, %2, 32 + RET_ReallyLR +... 
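For reference, wide G_INSERTs like the ones legalized above typically arise from aggregate updates; the irtranslator-exceptions.ll diff earlier in this patch shows insertvalue translating to G_INSERT. A minimal illustrative sketch (hypothetical name, not a test in this patch):

define { i64, i64 } @illustrate_insert({ i64, i64 } %agg, i64 %lo) {
  ; insertvalue is translated to a G_INSERT into the 128-bit aggregate value
  ; (compare the G_INSERT checks in irtranslator-exceptions.ll above).
  %res = insertvalue { i64, i64 } %agg, i64 %lo, 0
  ret { i64, i64 } %res
}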
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir new file mode 100644 index 000000000000..69e72bcb1f38 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir @@ -0,0 +1,206 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + + define void @test_sitofp_s32_s32() { ret void } + define void @test_uitofp_s32_s32() { ret void } + define void @test_sitofp_s32_s64() { ret void } + define void @test_uitofp_s32_s64() { ret void } + + define void @test_sitofp_s64_s32() { ret void } + define void @test_uitofp_s64_s32() { ret void } + define void @test_sitofp_s64_s64() { ret void } + define void @test_uitofp_s64_s64() { ret void } + + define void @test_sitofp_s32_s1() { ret void } + define void @test_uitofp_s32_s1() { ret void } + + define void @test_sitofp_s64_s8() { ret void } + define void @test_uitofp_s64_s8() { ret void } + + define void @test_sitofp_s32_s16() { ret void } + define void @test_uitofp_s32_s16() { ret void } +... + +--- +name: test_sitofp_s32_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_sitofp_s32_s32 + ; CHECK: %1(s32) = G_SITOFP %0 + %1:_(s32) = G_SITOFP %0 +... + +--- +name: test_uitofp_s32_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_uitofp_s32_s32 + ; CHECK: %1(s32) = G_UITOFP %0 + %1:_(s32) = G_UITOFP %0 +... + +--- +name: test_sitofp_s32_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_sitofp_s32_s64 + ; CHECK: %1(s32) = G_SITOFP %0 + %1:_(s32) = G_SITOFP %0 +... + +--- +name: test_uitofp_s32_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_uitofp_s32_s64 + ; CHECK: %1(s32) = G_UITOFP %0 + %1:_(s32) = G_UITOFP %0 +... + +--- +name: test_sitofp_s64_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_sitofp_s64_s32 + ; CHECK: %1(s64) = G_SITOFP %0 + %1:_(s64) = G_SITOFP %0 +... + +--- +name: test_uitofp_s64_s32 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + + ; CHECK-LABEL: name: test_uitofp_s64_s32 + ; CHECK: %1(s64) = G_UITOFP %0 + %1:_(s64) = G_UITOFP %0 +... + +--- +name: test_sitofp_s64_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_sitofp_s64_s64 + ; CHECK: %1(s64) = G_SITOFP %0 + %1:_(s64) = G_SITOFP %0 +... + +--- +name: test_uitofp_s64_s64 +body: | + bb.0: + liveins: %x0 + %0:_(s64) = COPY %x0 + + ; CHECK-LABEL: name: test_uitofp_s64_s64 + ; CHECK: %1(s64) = G_UITOFP %0 + %1:_(s64) = G_UITOFP %0 +... + + +--- +name: test_sitofp_s32_s1 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + %1:_(s1) = G_TRUNC %0 + + ; CHECK-LABEL: name: test_sitofp_s32_s1 + ; CHECK: %3(s32) = G_SEXT %1 + ; CHECK: %2(s32) = G_SITOFP %3 + %2:_(s32) = G_SITOFP %1 +... + +--- +name: test_uitofp_s32_s1 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + %1:_(s1) = G_TRUNC %0 + + ; CHECK-LABEL: name: test_uitofp_s32_s1 + ; CHECK: %3(s32) = G_ZEXT %1 + ; CHECK: %2(s32) = G_UITOFP %3 + %2:_(s32) = G_UITOFP %1 +... + +--- +name: test_sitofp_s64_s8 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + %1:_(s8) = G_TRUNC %0 + + ; CHECK-LABEL: name: test_sitofp_s64_s8 + ; CHECK: %3(s32) = G_SEXT %1 + ; CHECK: %2(s64) = G_SITOFP %3 + %2:_(s64) = G_SITOFP %1 +... 
+ +--- +name: test_uitofp_s64_s8 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + %1:_(s8) = G_TRUNC %0 + + ; CHECK-LABEL: name: test_uitofp_s64_s8 + ; CHECK: %3(s32) = G_ZEXT %1 + ; CHECK: %2(s64) = G_UITOFP %3 + %2:_(s64) = G_UITOFP %1 +... + +--- +name: test_sitofp_s32_s16 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + %1:_(s16) = G_TRUNC %0 + + ; CHECK-LABEL: name: test_sitofp_s32_s16 + ; CHECK: %3(s32) = G_SEXT %1 + ; CHECK: %2(s32) = G_SITOFP %3 + %2:_(s32) = G_SITOFP %1 +... + +--- +name: test_uitofp_s32_s16 +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + %1:_(s16) = G_TRUNC %0 + + ; CHECK-LABEL: name: test_uitofp_s32_s16 + ; CHECK: %3(s32) = G_ZEXT %1 + ; CHECK: %2(s32) = G_UITOFP %3 + %2:_(s32) = G_UITOFP %1 +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir index 6a86686fa4bd..c806b4a7060d 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -24,6 +24,7 @@ registers: - { id: 5, class: _ } - { id: 6, class: _ } - { id: 7, class: _ } + - { id: 8, class: _ } body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 @@ -51,6 +52,15 @@ body: | ; CHECK: %7(<2 x s32>) = G_LOAD %0(p0) :: (load 8 from %ir.addr) %7(<2 x s32>) = G_LOAD %0(p0) :: (load 8 from %ir.addr) + + ; CHECK: [[OFFSET0:%[0-9]+]](s64) = G_CONSTANT i64 0 + ; CHECK: [[GEP0:%[0-9]+]](p0) = G_GEP %0, [[OFFSET0]](s64) + ; CHECK: [[LOAD0:%[0-9]+]](s64) = G_LOAD [[GEP0]](p0) :: (load 16 from %ir.addr) + ; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP1:%[0-9]+]](p0) = G_GEP %0, [[OFFSET1]](s64) + ; CHECK: [[LOAD1:%[0-9]+]](s64) = G_LOAD [[GEP1]](p0) :: (load 16 from %ir.addr) + ; CHECK: %8(s128) = G_MERGE_VALUES [[LOAD0]](s64), [[LOAD1]](s64) + %8(s128) = G_LOAD %0(p0) :: (load 16 from %ir.addr) ... --- @@ -62,6 +72,8 @@ registers: - { id: 3, class: _ } - { id: 4, class: _ } - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 @@ -70,7 +82,7 @@ body: | %0(p0) = COPY %x0 %1(s32) = COPY %w1 - ; CHECK: [[BIT8:%[0-9]+]](s8) = G_ANYEXT %2(s1) + ; CHECK: [[BIT8:%[0-9]+]](s8) = G_ZEXT %2(s1) ; CHECK: G_STORE [[BIT8]](s8), %0(p0) :: (store 1 into %ir.addr) %2(s1) = G_TRUNC %1 G_STORE %2, %0 :: (store 1 into %ir.addr) @@ -92,4 +104,14 @@ body: | ; CHECK: G_STORE %0(p0), %0(p0) :: (store 8 into %ir.addr) G_STORE %0(p0), %0(p0) :: (store 8 into %ir.addr) + + ; CHECK: [[OFFSET0:%[0-9]+]](s64) = G_CONSTANT i64 0 + ; CHECK: [[GEP0:%[0-9]+]](p0) = G_GEP %0, [[OFFSET0]](s64) + ; CHECK: G_STORE %5(s64), [[GEP0]](p0) :: (store 16 into %ir.addr) + ; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP1:%[0-9]+]](p0) = G_GEP %0, [[OFFSET1]](s64) + ; CHECK: G_STORE %6(s64), [[GEP1]](p0) :: (store 16 into %ir.addr) + %6(s64) = G_PTRTOINT %0(p0) + %7(s128) = G_MERGE_VALUES %5, %6 + G_STORE %7, %0 :: (store 16 into %ir.addr) ... 
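For reference, the split s128 accesses checked above (two 8-byte operations addressed through G_GEP offsets 0 and 8) would be produced from IR along these lines; the function name is hypothetical and this sketch is not part of the patch:

define void @illustrate_i128_copy(i128* %addr) {
  ; The legalizer breaks both the i128 load and the i128 store into two s64
  ; operations at byte offsets 0 and 8, as in the CHECK lines above.
  %val = load i128, i128* %addr
  store i128 %val, i128* %addr
  ret void
}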
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir index eb642d4b1a74..1ea6e9c292f5 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -7,6 +7,7 @@ entry: ret void } + define void @test_mul_overflow() { ret void } ... --- @@ -22,7 +23,10 @@ body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 ; CHECK-LABEL: name: test_scalar_mul_small - ; CHECK: %4(s8) = G_MUL %2, %3 + ; CHECK: [[OP0:%.*]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[OP1:%.*]](s32) = G_ANYEXT %3(s8) + ; CHECK: [[RES32:%.*]](s32) = G_MUL [[OP0]], [[OP1]] + ; CHECK: [[RES:%.*]](s8) = G_TRUNC [[RES32]](s32) %0(s64) = COPY %x0 %1(s64) = COPY %x1 @@ -32,3 +36,22 @@ body: | %5(s64) = G_ANYEXT %2 %x0 = COPY %5 ... + + +--- +name: test_mul_overflow +body: | + bb.0: + liveins: %x0, %x1, %w2, %w3 + + %0:_(s64) = COPY %x0 + %1:_(s64) = COPY %x1 + + ; CHECK-LABEL: name: test_mul_overflow + ; CHECK: %2(s64) = G_MUL %0, %1 + ; CHECK: [[HI:%[0-9]+]](s64) = G_SMULH %0, %1 + ; CHECK: [[ZERO:%[0-9]+]](s64) = G_CONSTANT i64 0 + ; CHECK: %3(s1) = G_ICMP intpred(ne), [[HI]](s64), [[ZERO]] + %2:_(s64), %3:_(s1) = G_SMULO %0, %1 + +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir b/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir new file mode 100644 index 000000000000..9928ea54d2c9 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir @@ -0,0 +1,29 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + define void @test_legalize_merge_v3s32() { + ret void + } +... +--- +name: test_legalize_merge_v3s32 +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %w0, %w1, %w2 + ; CHECK-LABEL: name: test_legalize_merge_v3s32 + ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0 + ; CHECK: [[ARG2:%[0-9]+]](s32) = COPY %w1 + ; CHECK: [[ARG3:%[0-9]+]](s32) = COPY %w2 + ; CHECK: (<3 x s32>) = G_MERGE_VALUES [[ARG1]](s32), [[ARG2]](s32), [[ARG3]](s32) + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = COPY %w2 + %3(<3 x s32>) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32) +... 
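For reference, a <3 x s32> built from three scalars corresponds to IR like the sketch below (hypothetical name; whether the translator emits exactly the G_MERGE_VALUES form checked above is not asserted here):

define <3 x i32> @illustrate_v3i32(i32 %a, i32 %b, i32 %c) {
  %v0 = insertelement <3 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <3 x i32> %v0, i32 %b, i32 1
  %v2 = insertelement <3 x i32> %v1, i32 %c, i32 2
  ret <3 x i32> %v2
}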
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-or.mir b/test/CodeGen/AArch64/GlobalISel/legalize-or.mir index edf10cd411eb..e8b850982460 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-or.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-or.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -22,7 +22,10 @@ body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 ; CHECK-LABEL: name: test_scalar_or_small - ; CHECK: %4(s8) = G_OR %2, %3 + ; CHECK: [[OP0:%.*]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[OP1:%.*]](s32) = G_ANYEXT %3(s8) + ; CHECK: [[RES32:%.*]](s32) = G_OR [[OP0]], [[OP1]] + ; CHECK: [[RES:%.*]](s8) = G_TRUNC [[RES32]](s32) %0(s64) = COPY %x0 %1(s64) = COPY %x1 diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir b/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir new file mode 100644 index 000000000000..2becc2e134b5 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir @@ -0,0 +1,38 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + define void @test_pow() { + entry: + ret void + } +... + +--- +name: test_pow +body: | + bb.0.entry: + liveins: %d0, %d1, %s2, %s3 + + ; CHECK-LABEL: name: test_pow + ; CHECK: hasCalls: true + + %0:_(s64) = COPY %d0 + %1:_(s64) = COPY %d1 + %2:_(s32) = COPY %s2 + %3:_(s32) = COPY %s3 + + ; CHECK: %d0 = COPY %0 + ; CHECK: %d1 = COPY %1 + ; CHECK: BL $pow, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %d0, implicit %d1, implicit-def %d0 + ; CHECK: %4(s64) = COPY %d0 + %4:_(s64) = G_FPOW %0, %1 + + ; CHECK: %s0 = COPY %2 + ; CHECK: %s1 = COPY %3 + ; CHECK: BL $powf, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %s0, implicit %s1, implicit-def %s0 + ; CHECK: %5(s32) = COPY %s0 + %5:_(s32) = G_FPOW %2, %3 + +... 
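For reference, G_FPOW typically comes from the llvm.pow intrinsics, and the checks above show it lowering to the $pow / $powf libcalls. An illustrative IR sketch (hypothetical function names, not a test in this patch):

declare double @llvm.pow.f64(double, double)
declare float @llvm.pow.f32(float, float)

define double @illustrate_pow_f64(double %x, double %y) {
  ; Lowered by the legalizer to a libcall (see the BL $pow check above).
  %r = call double @llvm.pow.f64(double %x, double %y)
  ret double %r
}

define float @illustrate_pow_f32(float %x, float %y) {
  ; Lowered to a call to powf.
  %r = call float @llvm.pow.f32(float %x, float %y)
  ret float %r
}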
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir b/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir index e77f3487609f..50a4d93cbe20 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -45,8 +45,15 @@ body: | ; CHECK: [[RHS32:%[0-9]+]](s32) = G_SEXT %7 ; CHECK: [[QUOT32:%[0-9]+]](s32) = G_SDIV [[LHS32]], [[RHS32]] ; CHECK: [[QUOT:%[0-9]+]](s8) = G_TRUNC [[QUOT32]] - ; CHECK: [[PROD:%[0-9]+]](s8) = G_MUL [[QUOT]], %7 - ; CHECK: [[RES:%[0-9]+]](s8) = G_SUB %6, [[PROD]] + + ; CHECK: [[QUOT32_2:%.*]](s32) = G_ANYEXT [[QUOT]](s8) + ; CHECK: [[RHS32_2:%.*]](s32) = G_ANYEXT %7(s8) + ; CHECK: [[PROD32:%.*]](s32) = G_MUL [[QUOT32_2]], [[RHS32_2]] + ; CHECK: [[PROD:%.*]](s8) = G_TRUNC [[PROD32]](s32) + + ; CHECK: [[LHS32_2:%.*]](s32) = G_ANYEXT %6(s8) + ; CHECK: [[PROD32_2:%.*]](s32) = G_ANYEXT [[PROD]](s8) + ; CHECK: [[RES:%[0-9]+]](s32) = G_SUB [[LHS32_2]], [[PROD32_2]] %6(s8) = G_TRUNC %0 %7(s8) = G_TRUNC %1 %8(s8) = G_SREM %6, %7 diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir b/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir new file mode 100644 index 000000000000..f75a2982a3f2 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir @@ -0,0 +1,47 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + define void @test_shift() { + entry: + ret void + } +... + +--- +name: test_shift +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } +body: | + bb.0.entry: + liveins: %x0, %x1, %x2, %x3 + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s8) = G_TRUNC %0 + %3(s8) = G_TRUNC %1 + + ; CHECK: [[LHS32:%[0-9]+]](s32) = G_SEXT %2 + ; CHECK: [[RHS32:%[0-9]+]](s32) = G_SEXT %3 + ; CHECK: [[RES32:%[0-9]+]](s32) = G_ASHR [[LHS32]], [[RHS32]] + ; CHECK: %4(s8) = G_TRUNC [[RES32]] + %4(s8) = G_ASHR %2, %3 + + ; CHECK: [[LHS32:%[0-9]+]](s32) = G_ZEXT %2 + ; CHECK: [[RHS32:%[0-9]+]](s32) = G_ZEXT %3 + ; CHECK: [[RES32:%[0-9]+]](s32) = G_LSHR [[LHS32]], [[RHS32]] + ; CHECK: %5(s8) = G_TRUNC [[RES32]] + %5(s8) = G_LSHR %2, %3 + + ; CHECK: [[OP0:%.*]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[OP1:%.*]](s32) = G_ANYEXT %3(s8) + ; CHECK: [[RES32:%.*]](s32) = G_SHL [[OP0]], [[OP1]] + ; CHECK: [[RES:%.*]](s8) = G_TRUNC [[RES32]](s32) + %6(s8) = G_SHL %2, %3 +... 
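For reference, the extension choices checked above follow the shift semantics on narrow types: G_ASHR widens through G_SEXT, G_LSHR through G_ZEXT, and G_SHL through G_ANYEXT, with the result truncated back. An IR sketch of 8-bit shifts (hypothetical function name, not a test in this patch):

define i8 @illustrate_shifts(i8 %val, i8 %amt) {
  %a = ashr i8 %val, %amt  ; widened via G_SEXT per the checks above
  %l = lshr i8 %a, %amt    ; widened via G_ZEXT
  %s = shl i8 %l, %amt     ; widened via G_ANYEXT, then truncated back to s8
  ret i8 %s
}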
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir b/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir index 41a9c33bfad8..cd24bccfe771 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -31,103 +31,56 @@ registers: - { id: 14, class: _ } - { id: 15, class: _ } - { id: 16, class: _ } - - { id: 17, class: _ } - - { id: 18, class: _ } - - { id: 19, class: _ } - - { id: 20, class: _ } - - { id: 21, class: _ } - - { id: 22, class: _ } - - { id: 23, class: _ } - - { id: 24, class: _ } - - { id: 25, class: _ } - - { id: 26, class: _ } - - { id: 27, class: _ } - - { id: 28, class: _ } - - { id: 29, class: _ } - - { id: 30, class: _ } - - { id: 31, class: _ } - - { id: 32, class: _ } - - { id: 33, class: _ } - - { id: 34, class: _ } body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 %0(s64) = COPY %x0 + %1(s1) = G_TRUNC %0 + %2(s8) = G_TRUNC %0 + %3(s16) = G_TRUNC %0 + %4(s32) = G_TRUNC %0 + ; CHECK-LABEL: name: test_simple - ; CHECK: %1(p0) = G_INTTOPTR %0 - ; CHECK: %2(s64) = G_PTRTOINT %1 - %1(p0) = G_INTTOPTR %0 - %2(s64) = G_PTRTOINT %1 + ; CHECK: %5(p0) = G_INTTOPTR %0 + ; CHECK: %6(s64) = G_PTRTOINT %5 + %5(p0) = G_INTTOPTR %0 + %6(s64) = G_PTRTOINT %5 - ; CHECK: G_BRCOND %3(s1), %bb.1.next - %3(s1) = G_TRUNC %0 - G_BRCOND %3, %bb.1.next + ; CHECK: G_BRCOND %1(s1), %bb.1.next + G_BRCOND %1, %bb.1.next bb.1.next: - %4(s32) = G_TRUNC %0 - - ; CHECK: %5(s1) = G_FPTOSI %4 - ; CHECK: %6(s8) = G_FPTOUI %4 - ; CHECK: %7(s16) = G_FPTOSI %4 - ; CHECK: %8(s32) = G_FPTOUI %4 - ; CHECK: %9(s64) = G_FPTOSI %4 - %5(s1) = G_FPTOSI %4 - %6(s8) = G_FPTOUI %4 - %7(s16) = G_FPTOSI %4 - %8(s32) = G_FPTOUI %4 - %9(s64) = G_FPTOSI %4 - ; CHECK: %10(s1) = G_FPTOUI %0 - ; CHECK: %11(s8) = G_FPTOSI %0 - ; CHECK: %12(s16) = G_FPTOUI %0 - ; CHECK: %13(s32) = G_FPTOSI %0 - ; CHECK: %14(s32) = G_FPTOUI %0 - %10(s1) = G_FPTOUI %0 - %11(s8) = G_FPTOSI %0 - %12(s16) = G_FPTOUI %0 - %13(s32) = G_FPTOSI %0 - %14(s32) = G_FPTOUI %0 + ; CHECK: [[LHS:%[0-9]+]](s32) = G_ANYEXT %1(s1) + ; CHECK: [[RHS:%[0-9]+]](s32) = G_ANYEXT %1(s1) + ; CHECK: [[RES:%[0-9]+]](s32) = G_SELECT %1(s1), [[LHS]], [[RHS]] + ; CHECK: %7(s1) = G_TRUNC [[RES]](s32) + %7(s1) = G_SELECT %1, %1, %1 - ; CHECK: %15(s32) = G_UITOFP %5 - ; CHECK: %16(s32) = G_SITOFP %11 - ; CHECK: %17(s32) = G_UITOFP %7 - ; CHECK: %18(s32) = G_SITOFP %4 - ; CHECK: %19(s32) = G_UITOFP %0 - %15(s32) = G_UITOFP %5 - %16(s32) = G_SITOFP %11 - %17(s32) = G_UITOFP %7 - %18(s32) = G_SITOFP %4 - %19(s32) = G_UITOFP %0 + ; CHECK: [[LHS:%[0-9]+]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[RHS:%[0-9]+]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[RES:%[0-9]+]](s32) = G_SELECT %1(s1), [[LHS]], [[RHS]] + ; CHECK: %8(s8) = G_TRUNC [[RES]](s32) + %8(s8) = G_SELECT %1, %2, %2 - ; CHECK: %20(s64) = G_SITOFP %5 - ; CHECK: %21(s64) = G_UITOFP %11 - ; CHECK: %22(s64) = G_SITOFP %7 - ; CHECK: %23(s64) = G_UITOFP %4 - ; CHECK: %24(s64) = G_SITOFP %0 - %20(s64) = G_SITOFP %5 - %21(s64) = G_UITOFP %11 - %22(s64) = G_SITOFP %7 - %23(s64) = G_UITOFP %4 - %24(s64) = G_SITOFP %0 + ; CHECK: [[LHS:%[0-9]+]](s32) = G_ANYEXT %3(s16) + ; CHECK: [[RHS:%[0-9]+]](s32) = G_ANYEXT %3(s16) + ; CHECK: [[RES:%[0-9]+]](s32) = G_SELECT %1(s1), [[LHS]], [[RHS]] + ; CHECK: %9(s16) = G_TRUNC [[RES]](s32) + %9(s16) = G_SELECT 
%1, %3, %3 - ; CHECK: %25(s1) = G_SELECT %10(s1), %10, %5 - ; CHECK: %26(s8) = G_SELECT %10(s1), %6, %11 - ; CHECK: %27(s16) = G_SELECT %10(s1), %12, %7 - ; CHECK: %28(s32) = G_SELECT %10(s1), %15, %16 - ; CHECK: %29(s64) = G_SELECT %10(s1), %9, %24 - %25(s1) = G_SELECT %10, %10, %5 - %26(s8) = G_SELECT %10, %6, %11 - %27(s16) = G_SELECT %10, %12, %7 - %28(s32) = G_SELECT %10, %15, %16 - %29(s64) = G_SELECT %10, %9, %24 + %10(s32) = G_SELECT %1, %4, %4 + %11(s64) = G_SELECT %1, %0, %0 - ; CHECK: %30(<2 x s32>) = G_BITCAST %9 - ; CHECK: %31(s64) = G_BITCAST %30 - ; CHECK: %32(s32) = G_BITCAST %15 - %30(<2 x s32>) = G_BITCAST %9 - %31(s64) = G_BITCAST %30 - %32(s32) = G_BITCAST %15 - %33(<4 x s8>) = G_BITCAST %15 - %34(<2 x s16>) = G_BITCAST %15 + ; CHECK: %12(<2 x s32>) = G_BITCAST %0 + ; CHECK: %13(s64) = G_BITCAST %12 + ; CHECK: %14(s32) = G_BITCAST %10 + ; CHECK: %15(<4 x s8>) = G_BITCAST %0 + ; CHECK: %16(<2 x s16>) = G_BITCAST %0 + %12(<2 x s32>) = G_BITCAST %0 + %13(s64) = G_BITCAST %12 + %14(s32) = G_BITCAST %10 + %15(<4 x s8>) = G_BITCAST %0 + %16(<2 x s16>) = G_BITCAST %0 ... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir b/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir index e5403cb73c37..82a1dd09c1a1 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -22,7 +22,10 @@ body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 ; CHECK-LABEL: name: test_scalar_sub_small - ; CHECK: [[RES:%.*]](s8) = G_SUB %2, %3 + ; CHECK: [[OP0:%.*]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[OP1:%.*]](s32) = G_ANYEXT %3(s8) + ; CHECK: [[RES32:%.*]](s32) = G_SUB [[OP0]], [[OP1]] + ; CHECK: [[RES:%.*]](s8) = G_TRUNC [[RES32]](s32) %0(s64) = COPY %x0 %1(s64) = COPY %x1 diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir b/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir new file mode 100644 index 000000000000..8bda08d0a1d1 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir @@ -0,0 +1,39 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--" + define void @test_vaarg() { ret void } +... 
+ +--- +name: test_vaarg +body: | + bb.0: + %0:_(p0) = COPY %x0 + + ; CHECK-LABEL: name: test_vaarg + ; CHECK: [[LIST:%[0-9]+]](p0) = G_LOAD %0(p0) :: (load 8) + ; CHECK: %1(s8) = G_LOAD [[LIST]](p0) :: (load 1, align 8) + ; CHECK: [[SLOTSIZE:%[0-9]+]](s64) = G_CONSTANT i64 8 + ; CHECK: [[NEXT:%[0-9]+]](p0) = G_GEP [[LIST]], [[SLOTSIZE]](s64) + ; CHECK: G_STORE [[NEXT]](p0), %0(p0) :: (store 8) + %1:_(s8) = G_VAARG %0(p0), 1 + + ; CHECK: [[LIST:%[0-9]+]](p0) = G_LOAD %0(p0) :: (load 8) + ; CHECK: %2(s64) = G_LOAD [[LIST]](p0) :: (load 8) + ; CHECK: [[SLOTSIZE:%[0-9]+]](s64) = G_CONSTANT i64 8 + ; CHECK: [[NEXT:%[0-9]+]](p0) = G_GEP [[LIST]], [[SLOTSIZE]](s64) + ; CHECK: G_STORE [[NEXT]](p0), %0(p0) :: (store 8) + %2:_(s64) = G_VAARG %0(p0), 8 + + ; CHECK: [[LIST:%[0-9]+]](p0) = G_LOAD %0(p0) :: (load 8) + ; CHECK: [[ALIGNM1:%[0-9]+]](s64) = G_CONSTANT i64 15 + ; CHECK: [[ALIGNTMP:%[0-9]+]](p0) = G_GEP [[LIST]], [[ALIGNM1]](s64) + ; CHECK: [[LIST:%[0-9]+]](p0) = G_PTR_MASK [[ALIGNTMP]], 4 + ; CHECK: %3(s64) = G_LOAD [[LIST]](p0) :: (load 8, align 16) + ; CHECK: [[SLOTSIZE:%[0-9]+]](s64) = G_CONSTANT i64 8 + ; CHECK: [[NEXT:%[0-9]+]](p0) = G_GEP [[LIST]], [[SLOTSIZE]](s64) + ; CHECK: G_STORE [[NEXT]](p0), %0(p0) :: (store 8) + %3:_(s64) = G_VAARG %0(p0), 16 +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir b/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir index 919e674965c0..460b3d16f1c0 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir @@ -1,4 +1,4 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s +# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -22,7 +22,10 @@ body: | bb.0.entry: liveins: %x0, %x1, %x2, %x3 ; CHECK-LABEL: name: test_scalar_xor_small - ; CHECK: %4(s8) = G_XOR %2, %3 + ; CHECK: [[OP0:%.*]](s32) = G_ANYEXT %2(s8) + ; CHECK: [[OP1:%.*]](s32) = G_ANYEXT %3(s8) + ; CHECK: [[RES32:%.*]](s32) = G_XOR [[OP0]], [[OP1]] + ; CHECK: [[RES:%.*]](s8) = G_TRUNC [[RES32]](s32) %0(s64) = COPY %x0 %1(s64) = COPY %x1 diff --git a/test/CodeGen/AArch64/GlobalISel/no-regclass.mir b/test/CodeGen/AArch64/GlobalISel/no-regclass.mir new file mode 100644 index 000000000000..6832ce0ee8bd --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/no-regclass.mir @@ -0,0 +1,30 @@ +# RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -start-before=legalizer -stop-after=instruction-select %s -o - | FileCheck %s + +# We run the legalizer to combine the trivial EXTRACT_SEQ pair, leaving %1 and +# %2 orphaned after instruction-selection (no instructions define or use +# them). This shouldn't be a problem. + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @unused_reg() { ret void } + +--- +# CHECK-LABEL: name: unused_reg +name: unused_reg +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %w0 = COPY %0 + +body: | + bb.0: + liveins: %w0 + %0:gpr(s32) = COPY %w0 + %1:gpr(s32) = G_SEQUENCE %0(s32), 0 + %2:gpr(s32) = G_EXTRACT %1(s32), 0 + %w0 = COPY %2(s32) +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir b/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir new file mode 100644 index 000000000000..73d4d2054729 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir @@ -0,0 +1,45 @@ +# RUN: llc -O0 -mtriple arm64-- -run-pass=regbankselect -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @test_dbg_value() !dbg !5 { + ; Keep the dbg metadata live by referencing it in the IR. + call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !7, metadata !9), !dbg !10 + ret void + } + + declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!3, !4} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "llvm", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) + !1 = !DIFile(filename: "test.ll", directory: "/tmp") + !2 = !{} + !3 = !{i32 2, !"Dwarf Version", i32 4} + !4 = !{i32 2, !"Debug Info Version", i32 3} + !5 = distinct !DISubprogram(name: "test_dbg_value", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) + !6 = !DISubroutineType(types: !2) + !7 = !DILocalVariable(name: "in", arg: 1, scope: !5, file: !1, line: 1, type: !8) + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !DIExpression() + !10 = !DILocation(line: 1, column: 1, scope: !5) +... + +--- +# CHECK-LABEL: name: test_dbg_value +name: test_dbg_value +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +body: | + bb.0: + liveins: %w0 + %0:_(s32) = COPY %w0 + ; CHECK: DBG_VALUE debug-use %0(s32), debug-use _, !7, !9, debug-location !10 + DBG_VALUE debug-use %0(s32), debug-use _, !7, !9, debug-location !10 + + ; CHECK: DBG_VALUE _, 0, !7, !9, debug-location !10 + DBG_VALUE _, 0, !7, !9, debug-location !10 +... diff --git a/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir b/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir index 12162eb54a83..14ee40c941bf 100644 --- a/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir +++ b/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir @@ -622,7 +622,7 @@ body: | ; CHECK: %0(p0) = COPY %x0 ; CHECK: %1(s32) = G_LOAD %0 %0(p0) = COPY %x0 - %1(s32) = G_LOAD %0 + %1(s32) = G_LOAD %0 :: (load 4) ... --- @@ -643,7 +643,7 @@ body: | ; CHECK: G_STORE %1(s32), %0(p0) %0(p0) = COPY %x0 %1(s32) = COPY %w1 - G_STORE %1, %0 + G_STORE %1, %0 :: (store 4) ... --- diff --git a/test/CodeGen/AArch64/GlobalISel/regbankselect-reg_sequence.mir b/test/CodeGen/AArch64/GlobalISel/regbankselect-reg_sequence.mir new file mode 100644 index 000000000000..15ccf1f5459c --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/regbankselect-reg_sequence.mir @@ -0,0 +1,25 @@ +# RUN: llc %s -mtriple aarch64-- -o - -run-pass regbankselect | FileCheck %s +--- | + define void @foo() { ret void } +... +--- +# CHECK-LABEL: foo +# Check that we produce a valid mapping for REG_SEQUENCE. +# This used to fail the RegisterBankInfo verify because +# we were using the exclusively the type of the definition +# whereas since REG_SEQUENCE are kind of target opcode +# their definition may not have a type. 
+# +# CHECK: id: 0, class: dd +name: foo +legalized: true +tracksRegLiveness: true +registers: + - { id: 0, class: dd } +body: | + bb.0: + liveins: %d0, %d1 + + %0 = REG_SEQUENCE %d0, %subreg.dsub0, %d1, %subreg.dsub1 + +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-binop.mir b/test/CodeGen/AArch64/GlobalISel/select-binop.mir new file mode 100644 index 000000000000..8ae2e1b2eb7d --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-binop.mir @@ -0,0 +1,1042 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @add_s32_gpr() { ret void } + define void @add_s64_gpr() { ret void } + + define void @add_imm_s32_gpr() { ret void } + define void @add_imm_s64_gpr() { ret void } + + define void @add_imm_s32_gpr_bb() { ret void } + + define void @sub_s32_gpr() { ret void } + define void @sub_s64_gpr() { ret void } + + define void @or_s32_gpr() { ret void } + define void @or_s64_gpr() { ret void } + define void @or_v2s32_fpr() { ret void } + + define void @and_s32_gpr() { ret void } + define void @and_s64_gpr() { ret void } + + define void @shl_s32_gpr() { ret void } + define void @shl_s64_gpr() { ret void } + + define void @lshr_s32_gpr() { ret void } + define void @lshr_s64_gpr() { ret void } + + define void @ashr_s32_gpr() { ret void } + define void @ashr_s64_gpr() { ret void } + + define void @mul_s32_gpr() { ret void } + define void @mul_s64_gpr() { ret void } + + define void @mulh_s64_gpr() { ret void } + + define void @sdiv_s32_gpr() { ret void } + define void @sdiv_s64_gpr() { ret void } + + define void @udiv_s32_gpr() { ret void } + define void @udiv_s64_gpr() { ret void } + + define void @fadd_s32_fpr() { ret void } + define void @fadd_s64_fpr() { ret void } + + define void @fsub_s32_fpr() { ret void } + define void @fsub_s64_fpr() { ret void } + + define void @fmul_s32_fpr() { ret void } + define void @fmul_s64_fpr() { ret void } + + define void @fdiv_s32_fpr() { ret void } + define void @fdiv_s64_fpr() { ret void } + +... + +--- +# Check that we select a 32-bit GPR G_ADD into ADDWrr on GPR32. +# Also check that we constrain the register class of the COPY to GPR32. +# CHECK-LABEL: name: add_s32_gpr +name: add_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = ADDWrr %0, %1 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_ADD %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s32_gpr, for 64-bit operations. +# CHECK-LABEL: name: add_s64_gpr +name: add_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = ADDXrr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_ADD %0, %1 + %x0 = COPY %2(s64) +... 
+ +--- +# CHECK-LABEL: name: add_imm_s32_gpr +name: add_imm_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr32sp } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %2 = ADDWri %0, 1, 0 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = G_CONSTANT i32 1 + %2(s32) = G_ADD %0, %1 + %w0 = COPY %2(s32) +... + +--- +# CHECK-LABEL: name: add_imm_s64_gpr +name: add_imm_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr64sp } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %2 = ADDXri %0, 1, 0 +body: | + bb.0: + liveins: %x0, %w1 + + %0(s64) = COPY %x0 + %1(s64) = G_CONSTANT i32 1 + %2(s64) = G_ADD %0, %1 + %x0 = COPY %2(s64) +... + +--- +# CHECK-LABEL: name: add_imm_s32_gpr_bb +name: add_imm_s32_gpr_bb +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr32sp } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: bb.1: +# CHECK: %2 = ADDWri %0, 1, 0 +body: | + bb.0: + liveins: %w0, %w1 + successors: %bb.1 + + %0(s32) = COPY %w0 + %1(s32) = G_CONSTANT i32 1 + G_BR %bb.1 + + bb.1: + %2(s32) = G_ADD %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s32_gpr, for G_SUB operations. +# CHECK-LABEL: name: sub_s32_gpr +name: sub_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = SUBSWrr %0, %1, implicit-def %nzcv +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_SUB %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s64_gpr, for G_SUB operations. +# CHECK-LABEL: name: sub_s64_gpr +name: sub_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = SUBSXrr %0, %1, implicit-def %nzcv +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_SUB %0, %1 + %x0 = COPY %2(s64) +... + +--- +# Same as add_s32_gpr, for G_OR operations. 
+# CHECK-LABEL: name: or_s32_gpr +name: or_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = ORRWrr %0, %1 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_OR %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s64_gpr, for G_OR operations. +# CHECK-LABEL: name: or_s64_gpr +name: or_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = ORRXrr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_OR %0, %1 + %x0 = COPY %2(s64) +... + +--- +# 64-bit G_OR on vector registers. +# CHECK-LABEL: name: or_v2s32_fpr +name: or_v2s32_fpr +legalized: true +regBankSelected: true +# +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +# CHECK-NEXT: - { id: 2, class: fpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = COPY %d1 +# The actual OR does not matter as long as it is operating +# on 64-bit width vector. +# CHECK: %2 = ORRv8i8 %0, %1 +body: | + bb.0: + liveins: %d0, %d1 + + %0(<2 x s32>) = COPY %d0 + %1(<2 x s32>) = COPY %d1 + %2(<2 x s32>) = G_OR %0, %1 + %d0 = COPY %2(<2 x s32>) +... + +--- +# Same as add_s32_gpr, for G_AND operations. +# CHECK-LABEL: name: and_s32_gpr +name: and_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = ANDWrr %0, %1 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_AND %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s64_gpr, for G_AND operations. +# CHECK-LABEL: name: and_s64_gpr +name: and_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = ANDXrr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_AND %0, %1 + %x0 = COPY %2(s64) +... + +--- +# Same as add_s32_gpr, for G_SHL operations. 
+# CHECK-LABEL: name: shl_s32_gpr +name: shl_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = LSLVWr %0, %1 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_SHL %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s64_gpr, for G_SHL operations. +# CHECK-LABEL: name: shl_s64_gpr +name: shl_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = LSLVXr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_SHL %0, %1 + %x0 = COPY %2(s64) +... + +--- +# Same as add_s32_gpr, for G_LSHR operations. +# CHECK-LABEL: name: lshr_s32_gpr +name: lshr_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = LSRVWr %0, %1 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_LSHR %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s64_gpr, for G_LSHR operations. +# CHECK-LABEL: name: lshr_s64_gpr +name: lshr_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = LSRVXr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_LSHR %0, %1 + %x0 = COPY %2(s64) +... + +--- +# Same as add_s32_gpr, for G_ASHR operations. +# CHECK-LABEL: name: ashr_s32_gpr +name: ashr_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = ASRVWr %0, %1 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_ASHR %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s64_gpr, for G_ASHR operations. 
+# CHECK-LABEL: name: ashr_s64_gpr +name: ashr_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = ASRVXr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_ASHR %0, %1 + %x0 = COPY %2(s64) +... + +--- +# Check that we select s32 GPR G_MUL. This is trickier than other binops because +# there is only MADDWrrr, and we have to use the WZR physreg. +# CHECK-LABEL: name: mul_s32_gpr +name: mul_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = MADDWrrr %0, %1, %wzr +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_MUL %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as mul_s32_gpr for the s64 type. +# CHECK-LABEL: name: mul_s64_gpr +name: mul_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = MADDXrrr %0, %1, %xzr +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_MUL %0, %1 + %x0 = COPY %2(s64) +... + +--- +# Same as mul_s32_gpr for the s64 type. +# CHECK-LABEL: name: mulh_s64_gpr +name: mulh_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +# CHECK-NEXT: - { id: 3, class: gpr64 } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = SMULHrr %0, %1 +# CHECK: %3 = UMULHrr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0:gpr(s64) = COPY %x0 + %1:gpr(s64) = COPY %x1 + %2:gpr(s64) = G_SMULH %0, %1 + %3:gpr(s64) = G_UMULH %0, %1 + %x0 = COPY %2(s64) + %x0 = COPY %3(s64) +... + +--- +# Same as add_s32_gpr, for G_SDIV operations. +# CHECK-LABEL: name: sdiv_s32_gpr +name: sdiv_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = SDIVWr %0, %1 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_SDIV %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s64_gpr, for G_SDIV operations. 
+# CHECK-LABEL: name: sdiv_s64_gpr +name: sdiv_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = SDIVXr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_SDIV %0, %1 + %x0 = COPY %2(s64) +... + +--- +# Same as add_s32_gpr, for G_UDIV operations. +# CHECK-LABEL: name: udiv_s32_gpr +name: udiv_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = UDIVWr %0, %1 +body: | + bb.0: + liveins: %w0, %w1 + + %0(s32) = COPY %w0 + %1(s32) = COPY %w1 + %2(s32) = G_UDIV %0, %1 + %w0 = COPY %2(s32) +... + +--- +# Same as add_s64_gpr, for G_UDIV operations. +# CHECK-LABEL: name: udiv_s64_gpr +name: udiv_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: %2 = UDIVXr %0, %1 +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_UDIV %0, %1 + %x0 = COPY %2(s64) +... + +--- +# Check that we select a s32 FPR G_FADD into FADDSrr. +# CHECK-LABEL: name: fadd_s32_fpr +name: fadd_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +# CHECK-NEXT: - { id: 2, class: fpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = COPY %s1 +# CHECK: %2 = FADDSrr %0, %1 +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + %1(s32) = COPY %s1 + %2(s32) = G_FADD %0, %1 + %s0 = COPY %2(s32) +... + +--- +# CHECK-LABEL: name: fadd_s64_fpr +name: fadd_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +# CHECK-NEXT: - { id: 2, class: fpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = COPY %d1 +# CHECK: %2 = FADDDrr %0, %1 +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + %2(s64) = G_FADD %0, %1 + %d0 = COPY %2(s64) +... + +--- +# CHECK-LABEL: name: fsub_s32_fpr +name: fsub_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +# CHECK-NEXT: - { id: 2, class: fpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = COPY %s1 +# CHECK: %2 = FSUBSrr %0, %1 +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + %1(s32) = COPY %s1 + %2(s32) = G_FSUB %0, %1 + %s0 = COPY %2(s32) +... 
+ +--- +# CHECK-LABEL: name: fsub_s64_fpr +name: fsub_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +# CHECK-NEXT: - { id: 2, class: fpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = COPY %d1 +# CHECK: %2 = FSUBDrr %0, %1 +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + %2(s64) = G_FSUB %0, %1 + %d0 = COPY %2(s64) +... + +--- +# CHECK-LABEL: name: fmul_s32_fpr +name: fmul_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +# CHECK-NEXT: - { id: 2, class: fpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = COPY %s1 +# CHECK: %2 = FMULSrr %0, %1 +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + %1(s32) = COPY %s1 + %2(s32) = G_FMUL %0, %1 + %s0 = COPY %2(s32) +... + +--- +# CHECK-LABEL: name: fmul_s64_fpr +name: fmul_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +# CHECK-NEXT: - { id: 2, class: fpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = COPY %d1 +# CHECK: %2 = FMULDrr %0, %1 +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + %2(s64) = G_FMUL %0, %1 + %d0 = COPY %2(s64) +... + +--- +# CHECK-LABEL: name: fdiv_s32_fpr +name: fdiv_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +# CHECK-NEXT: - { id: 2, class: fpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = COPY %s1 +# CHECK: %2 = FDIVSrr %0, %1 +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + %1(s32) = COPY %s1 + %2(s32) = G_FDIV %0, %1 + %s0 = COPY %2(s32) +... + +--- +# CHECK-LABEL: name: fdiv_s64_fpr +name: fdiv_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +# CHECK-NEXT: - { id: 2, class: fpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = COPY %d1 +# CHECK: %2 = FDIVDrr %0, %1 +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + %2(s64) = G_FDIV %0, %1 + %d0 = COPY %2(s64) +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/select-bitcast.mir b/test/CodeGen/AArch64/GlobalISel/select-bitcast.mir new file mode 100644 index 000000000000..5ca63dbc214d --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-bitcast.mir @@ -0,0 +1,212 @@ +# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @bitcast_s32_gpr() { ret void } + define void @bitcast_s32_fpr() { ret void } + define void @bitcast_s32_gpr_fpr() { ret void } + define void @bitcast_s32_fpr_gpr() { ret void } + define void @bitcast_s64_gpr() { ret void } + define void @bitcast_s64_fpr() { ret void } + define void @bitcast_s64_gpr_fpr() { ret void } + define void @bitcast_s64_fpr_gpr() { ret void } +... + +--- +# CHECK-LABEL: name: bitcast_s32_gpr +name: bitcast_s32_gpr +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32all } +# CHECK-NEXT: - { id: 1, class: gpr32all } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s32) = G_BITCAST %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: bitcast_s32_fpr +name: bitcast_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %s0 + + %0(s32) = COPY %s0 + %1(s32) = G_BITCAST %0 + %s0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: bitcast_s32_gpr_fpr +name: bitcast_s32_gpr_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32all } +# CHECK-NEXT: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s32) = G_BITCAST %0 + %s0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: bitcast_s32_fpr_gpr +name: bitcast_s32_fpr_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32all } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %s0 + + %0(s32) = COPY %s0 + %1(s32) = G_BITCAST %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: bitcast_s64_gpr +name: bitcast_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64all } +# CHECK-NEXT: - { id: 1, class: gpr64all } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %x0 + + %0(s64) = COPY %x0 + %1(s64) = G_BITCAST %0 + %x0 = COPY %1(s64) +... 
+ +--- +# CHECK-LABEL: name: bitcast_s64_fpr +name: bitcast_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %d0 + + %0(s64) = COPY %d0 + %1(s64) = G_BITCAST %0 + %d0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: bitcast_s64_gpr_fpr +name: bitcast_s64_gpr_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64all } +# CHECK-NEXT: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %x0 + + %0(s64) = COPY %x0 + %1(s64) = G_BITCAST %0 + %d0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: bitcast_s64_fpr_gpr +name: bitcast_s64_fpr_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64all } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %d0 + + %0(s64) = COPY %d0 + %1(s64) = G_BITCAST %0 + %x0 = COPY %1(s64) +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-br.mir b/test/CodeGen/AArch64/GlobalISel/select-br.mir new file mode 100644 index 000000000000..f46f190260f6 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-br.mir @@ -0,0 +1,71 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @unconditional_br() { ret void } + define void @conditional_br() { ret void } + define void @indirect_br() { ret void } +... + +--- +# CHECK-LABEL: name: unconditional_br +name: unconditional_br +legalized: true +regBankSelected: true + +# CHECK: body: +# CHECK: bb.0: +# CHECK: successors: %bb.0 +# CHECK: B %bb.0 +body: | + bb.0: + successors: %bb.0 + + G_BR %bb.0 +... + +--- +# CHECK-LABEL: name: conditional_br +name: conditional_br +legalized: true +regBankSelected: true + +registers: + - { id: 0, class: gpr } + +# CHECK: body: +# CHECK: bb.0: +# CHECK: TBNZW %0, 0, %bb.1 +# CHECK: B %bb.0 +body: | + bb.0: + successors: %bb.0, %bb.1 + %0(s1) = COPY %w0 + G_BRCOND %0(s1), %bb.1 + G_BR %bb.0 + + bb.1: +... + +--- +# CHECK-LABEL: name: indirect_br +name: indirect_br +legalized: true +regBankSelected: true + +registers: + - { id: 0, class: gpr } + +# CHECK: body: +# CHECK: bb.0: +# CHECK: %0 = COPY %x0 +# CHECK: BR %0 +body: | + bb.0: + successors: %bb.0, %bb.1 + %0(p0) = COPY %x0 + G_BRINDIRECT %0(p0) + + bb.1: +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-cbz.mir b/test/CodeGen/AArch64/GlobalISel/select-cbz.mir new file mode 100644 index 000000000000..2decb994b967 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-cbz.mir @@ -0,0 +1,108 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + define void @cbz_s32() { ret void } + define void @cbz_s64() { ret void } + define void @cbnz_s32() { ret void } + define void @cbnz_s64() { ret void } +... 
+ +--- +# CHECK-LABEL: name: cbz_s32 +name: cbz_s32 +legalized: true +regBankSelected: true + +# CHECK: body: +# CHECK: bb.0: +# CHECK: %0 = COPY %w0 +# CHECK: CBZW %0, %bb.1 +# CHECK: B %bb.0 +body: | + bb.0: + liveins: %w0 + successors: %bb.0, %bb.1 + + %0:gpr(s32) = COPY %w0 + %1:gpr(s32) = G_CONSTANT i32 0 + %2:gpr(s1) = G_ICMP intpred(eq), %0, %1 + G_BRCOND %2(s1), %bb.1 + G_BR %bb.0 + + bb.1: +... + +--- +# CHECK-LABEL: name: cbz_s64 +name: cbz_s64 +legalized: true +regBankSelected: true + +# CHECK: body: +# CHECK: bb.0: +# CHECK: %0 = COPY %x0 +# CHECK: CBZX %0, %bb.1 +# CHECK: B %bb.0 +body: | + bb.0: + liveins: %x0 + successors: %bb.0, %bb.1 + + %0:gpr(s64) = COPY %x0 + %1:gpr(s64) = G_CONSTANT i64 0 + %2:gpr(s1) = G_ICMP intpred(eq), %0, %1 + G_BRCOND %2(s1), %bb.1 + G_BR %bb.0 + + bb.1: +... + +--- +# CHECK-LABEL: name: cbnz_s32 +name: cbnz_s32 +legalized: true +regBankSelected: true + +# CHECK: body: +# CHECK: bb.0: +# CHECK: %0 = COPY %w0 +# CHECK: CBNZW %0, %bb.1 +# CHECK: B %bb.0 +body: | + bb.0: + liveins: %w0 + successors: %bb.0, %bb.1 + + %0:gpr(s32) = COPY %w0 + %1:gpr(s32) = G_CONSTANT i32 0 + %2:gpr(s1) = G_ICMP intpred(ne), %0, %1 + G_BRCOND %2(s1), %bb.1 + G_BR %bb.0 + + bb.1: +... + +--- +# CHECK-LABEL: name: cbnz_s64 +name: cbnz_s64 +legalized: true +regBankSelected: true + +# CHECK: body: +# CHECK: bb.0: +# CHECK: %0 = COPY %x0 +# CHECK: CBNZX %0, %bb.1 +# CHECK: B %bb.0 +body: | + bb.0: + liveins: %x0 + successors: %bb.0, %bb.1 + + %0:gpr(s64) = COPY %x0 + %1:gpr(s64) = G_CONSTANT i64 0 + %2:gpr(s1) = G_ICMP intpred(ne), %0, %1 + G_BRCOND %2(s1), %bb.1 + G_BR %bb.0 + + bb.1: +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-constant.mir b/test/CodeGen/AArch64/GlobalISel/select-constant.mir new file mode 100644 index 000000000000..1a5bac9fb7d6 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-constant.mir @@ -0,0 +1,77 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define i32 @const_s32() { ret i32 42 } + define i64 @const_s64() { ret i64 1234567890123 } + + define i32 @fconst_s32() { ret i32 42 } + define i64 @fconst_s64() { ret i64 1234567890123 } +... + +--- +# CHECK-LABEL: name: const_s32 +name: const_s32 +legalized: true +regBankSelected: true +registers: + - { id: 0, class: gpr } + +# CHECK: body: +# CHECK: %0 = MOVi32imm 42 +body: | + bb.0: + %0(s32) = G_CONSTANT i32 42 + %w0 = COPY %0(s32) +... + +--- +# CHECK-LABEL: name: const_s64 +name: const_s64 +legalized: true +regBankSelected: true +registers: + - { id: 0, class: gpr } + +# CHECK: body: +# CHECK: %0 = MOVi64imm 1234567890123 +body: | + bb.0: + %0(s64) = G_CONSTANT i64 1234567890123 + %x0 = COPY %0(s64) +... + +--- +# CHECK-LABEL: name: fconst_s32 +name: fconst_s32 +legalized: true +regBankSelected: true +registers: + - { id: 0, class: fpr } + +# CHECK: body: +# CHECK: [[TMP:%[0-9]+]] = MOVi32imm 1080033280 +# CHECK: %0 = COPY [[TMP]] +body: | + bb.0: + %0(s32) = G_FCONSTANT float 3.5 + %s0 = COPY %0(s32) +... + +--- +# CHECK-LABEL: name: fconst_s64 +name: fconst_s64 +legalized: true +regBankSelected: true +registers: + - { id: 0, class: fpr } + +# CHECK: body: +# CHECK: [[TMP:%[0-9]+]] = MOVi64imm 4607182418800017408 +# CHECK: %0 = COPY [[TMP]] +body: | + bb.0: + %0(s64) = G_FCONSTANT double 1.0 + %d0 = COPY %0(s64) +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir b/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir new file mode 100644 index 000000000000..2f36ec8d2aaa --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir @@ -0,0 +1,69 @@ +# RUN: llc -O0 -mtriple arm64-- -run-pass=instruction-select -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @test_dbg_value(i32 %a) !dbg !5 { + %tmp0 = add i32 %a, %a + call void @llvm.dbg.value(metadata i32 %tmp0, i64 0, metadata !7, metadata !9), !dbg !10 + ret void + } + + define void @test_dbg_value_dead(i32 %a) !dbg !5 { + call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !7, metadata !9), !dbg !10 + ret void + } + + declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!3, !4} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "llvm", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) + !1 = !DIFile(filename: "test.ll", directory: "/tmp") + !2 = !{} + !3 = !{i32 2, !"Dwarf Version", i32 4} + !4 = !{i32 2, !"Debug Info Version", i32 3} + !5 = distinct !DISubprogram(name: "test_dbg_value", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) + !6 = !DISubroutineType(types: !2) + !7 = !DILocalVariable(name: "in", arg: 1, scope: !5, file: !1, line: 1, type: !8) + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !DIExpression() + !10 = !DILocation(line: 1, column: 1, scope: !5) +... + +--- +# CHECK-LABEL: name: test_dbg_value +name: test_dbg_value +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: %w0 + %0:gpr(s32) = COPY %w0 + %1:gpr(s32) = G_ADD %0, %0 + %w0 = COPY %1(s32) + + ; CHECK: %0 = COPY %w0 + ; CHECK-NEXT: %1 = ADDWrr %0, %0 + ; CHECK-NEXT: %w0 = COPY %1 + ; CHECK-NEXT: DBG_VALUE debug-use %1, debug-use _, !7, !9, debug-location !10 + + DBG_VALUE debug-use %1(s32), debug-use _, !7, !9, debug-location !10 +... + +--- +# CHECK-LABEL: name: test_dbg_value_dead +name: test_dbg_value_dead +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: %w0 + %0:gpr(s32) = COPY %w0 + + ; CHECK-NOT: COPY + ; CHECK: DBG_VALUE debug-use _, debug-use _, !7, !9, debug-location !10 + + DBG_VALUE debug-use %0(s32), debug-use _, !7, !9, debug-location !10 +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir b/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir new file mode 100644 index 000000000000..fbb11a1c7a4c --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir @@ -0,0 +1,478 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @fptrunc() { ret void } + define void @fpext() { ret void } + + define void @sitofp_s32_s32_fpr() { ret void } + define void @sitofp_s32_s64_fpr() { ret void } + define void @sitofp_s64_s32_fpr() { ret void } + define void @sitofp_s64_s64_fpr() { ret void } + + define void @uitofp_s32_s32_fpr() { ret void } + define void @uitofp_s32_s64_fpr() { ret void } + define void @uitofp_s64_s32_fpr() { ret void } + define void @uitofp_s64_s64_fpr() { ret void } + + define void @fptosi_s32_s32_gpr() { ret void } + define void @fptosi_s32_s64_gpr() { ret void } + define void @fptosi_s64_s32_gpr() { ret void } + define void @fptosi_s64_s64_gpr() { ret void } + + define void @fptoui_s32_s32_gpr() { ret void } + define void @fptoui_s32_s64_gpr() { ret void } + define void @fptoui_s64_s32_gpr() { ret void } + define void @fptoui_s64_s64_gpr() { ret void } +... + +--- +# CHECK-LABEL: name: fptrunc +name: fptrunc +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK: - { id: 0, class: fpr64 } +# CHECK: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = FCVTSDr %0 +body: | + bb.0: + liveins: %d0 + + %0(s64) = COPY %d0 + %1(s32) = G_FPTRUNC %0 + %s0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: fpext +name: fpext +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK: - { id: 0, class: fpr32 } +# CHECK: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = FCVTDSr %0 +body: | + bb.0: + liveins: %d0 + + %0(s32) = COPY %s0 + %1(s64) = G_FPEXT %0 + %d0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: sitofp_s32_s32_fpr +name: sitofp_s32_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = SCVTFUWSri %0 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s32) = G_SITOFP %0 + %s0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: sitofp_s32_s64_fpr +name: sitofp_s32_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = SCVTFUXSri %0 +body: | + bb.0: + liveins: %x0 + + %0(s64) = COPY %x0 + %1(s32) = G_SITOFP %0 + %s0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: sitofp_s64_s32_fpr +name: sitofp_s64_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = SCVTFUWDri %0 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s64) = G_SITOFP %0 + %d0 = COPY %1(s64) +... 
+ +--- +# CHECK-LABEL: name: sitofp_s64_s64_fpr +name: sitofp_s64_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = SCVTFUXDri %0 +body: | + bb.0: + liveins: %x0 + + %0(s64) = COPY %x0 + %1(s64) = G_SITOFP %0 + %d0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: uitofp_s32_s32_fpr +name: uitofp_s32_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = UCVTFUWSri %0 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s32) = G_UITOFP %0 + %s0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: uitofp_s32_s64_fpr +name: uitofp_s32_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = UCVTFUXSri %0 +body: | + bb.0: + liveins: %x0 + + %0(s64) = COPY %x0 + %1(s32) = G_UITOFP %0 + %s0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: uitofp_s64_s32_fpr +name: uitofp_s64_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = UCVTFUWDri %0 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s64) = G_UITOFP %0 + %d0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: uitofp_s64_s64_fpr +name: uitofp_s64_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = UCVTFUXDri %0 +body: | + bb.0: + liveins: %x0 + + %0(s64) = COPY %x0 + %1(s64) = G_UITOFP %0 + %d0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: fptosi_s32_s32_gpr +name: fptosi_s32_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = FCVTZSUWSr %0 +body: | + bb.0: + liveins: %s0 + + %0(s32) = COPY %s0 + %1(s32) = G_FPTOSI %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: fptosi_s32_s64_gpr +name: fptosi_s32_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = FCVTZSUWDr %0 +body: | + bb.0: + liveins: %d0 + + %0(s64) = COPY %d0 + %1(s32) = G_FPTOSI %0 + %w0 = COPY %1(s32) +... 
+ +--- +# CHECK-LABEL: name: fptosi_s64_s32_gpr +name: fptosi_s64_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = FCVTZSUXSr %0 +body: | + bb.0: + liveins: %s0 + + %0(s32) = COPY %s0 + %1(s64) = G_FPTOSI %0 + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: fptosi_s64_s64_gpr +name: fptosi_s64_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = FCVTZSUXDr %0 +body: | + bb.0: + liveins: %d0 + + %0(s64) = COPY %d0 + %1(s64) = G_FPTOSI %0 + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: fptoui_s32_s32_gpr +name: fptoui_s32_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = FCVTZUUWSr %0 +body: | + bb.0: + liveins: %s0 + + %0(s32) = COPY %s0 + %1(s32) = G_FPTOUI %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: fptoui_s32_s64_gpr +name: fptoui_s32_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = FCVTZUUWDr %0 +body: | + bb.0: + liveins: %d0 + + %0(s64) = COPY %d0 + %1(s32) = G_FPTOUI %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: fptoui_s64_s32_gpr +name: fptoui_s64_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %s0 +# CHECK: %1 = FCVTZUUXSr %0 +body: | + bb.0: + liveins: %s0 + + %0(s32) = COPY %s0 + %1(s64) = G_FPTOUI %0 + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: fptoui_s64_s64_gpr +name: fptoui_s64_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %d0 +# CHECK: %1 = FCVTZUUXDr %0 +body: | + bb.0: + liveins: %d0 + + %0(s64) = COPY %d0 + %1(s64) = G_FPTOUI %0 + %x0 = COPY %1(s64) +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir new file mode 100644 index 000000000000..2ba8b7366252 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir @@ -0,0 +1,274 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @anyext_s64_from_s32() { ret void } + define void @anyext_s32_from_s8() { ret void } + + define void @zext_s64_from_s32() { ret void } + define void @zext_s32_from_s16() { ret void } + define void @zext_s32_from_s8() { ret void } + define void @zext_s16_from_s8() { ret void } + + define void @sext_s64_from_s32() { ret void } + define void @sext_s32_from_s16() { ret void } + define void @sext_s32_from_s8() { ret void } + define void @sext_s16_from_s8() { ret void } +... + +--- +# CHECK-LABEL: name: anyext_s64_from_s32 +name: anyext_s64_from_s32 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32all } +# CHECK-NEXT: - { id: 1, class: gpr64all } +# CHECK-NEXT: - { id: 2, class: gpr64all } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %2 = SUBREG_TO_REG 0, %0, 15 +# CHECK: %1 = COPY %2 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s64) = G_ANYEXT %0 + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: anyext_s32_from_s8 +name: anyext_s32_from_s8 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32all } +# CHECK-NEXT: - { id: 1, class: gpr32all } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %w0 + + %0(s8) = COPY %w0 + %1(s32) = G_ANYEXT %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: zext_s64_from_s32 +name: zext_s64_from_s32 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %2 = SUBREG_TO_REG 0, %0, 15 +# CHECK: %1 = UBFMXri %2, 0, 31 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s64) = G_ZEXT %0 + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: zext_s32_from_s16 +name: zext_s32_from_s16 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = UBFMWri %0, 0, 15 +body: | + bb.0: + liveins: %w0 + + %0(s16) = COPY %w0 + %1(s32) = G_ZEXT %0 + %w0 = COPY %1 +... + +--- +# CHECK-LABEL: name: zext_s32_from_s8 +name: zext_s32_from_s8 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = UBFMWri %0, 0, 7 +body: | + bb.0: + liveins: %w0 + + %0(s8) = COPY %w0 + %1(s32) = G_ZEXT %0 + %w0 = COPY %1(s32) +... 
+ +--- +# CHECK-LABEL: name: zext_s16_from_s8 +name: zext_s16_from_s8 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = UBFMWri %0, 0, 7 +body: | + bb.0: + liveins: %w0 + + %0(s8) = COPY %w0 + %1(s16) = G_ZEXT %0 + %w0 = COPY %1(s16) +... + +--- +# CHECK-LABEL: name: sext_s64_from_s32 +name: sext_s64_from_s32 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %2 = SUBREG_TO_REG 0, %0, 15 +# CHECK: %1 = SBFMXri %2, 0, 31 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s64) = G_SEXT %0 + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: sext_s32_from_s16 +name: sext_s32_from_s16 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = SBFMWri %0, 0, 15 +body: | + bb.0: + liveins: %w0 + + %0(s16) = COPY %w0 + %1(s32) = G_SEXT %0 + %w0 = COPY %1 +... + +--- +# CHECK-LABEL: name: sext_s32_from_s8 +name: sext_s32_from_s8 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = SBFMWri %0, 0, 7 +body: | + bb.0: + liveins: %w0 + + %0(s8) = COPY %w0 + %1(s32) = G_SEXT %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: sext_s16_from_s8 +name: sext_s16_from_s8 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %w0 +# CHECK: %1 = SBFMWri %0, 0, 7 +body: | + bb.0: + liveins: %w0 + + %0(s8) = COPY %w0 + %1(s16) = G_SEXT %0 + %w0 = COPY %1(s16) +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-int-ptr-casts.mir b/test/CodeGen/AArch64/GlobalISel/select-int-ptr-casts.mir new file mode 100644 index 000000000000..6537408f6d98 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-int-ptr-casts.mir @@ -0,0 +1,150 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @inttoptr_p0_s64() { ret void } + define void @ptrtoint_s64_p0() { ret void } + define void @ptrtoint_s32_p0() { ret void } + define void @ptrtoint_s16_p0() { ret void } + define void @ptrtoint_s8_p0() { ret void } + define void @ptrtoint_s1_p0() { ret void } +... + +--- +# CHECK-LABEL: name: inttoptr_p0_s64 +name: inttoptr_p0_s64 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64all } +# CHECK-NEXT: - { id: 1, class: gpr64all } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %x0 + %0(s64) = COPY %x0 + %1(p0) = G_INTTOPTR %0 + %x0 = COPY %1(p0) +... 
+ +--- +# CHECK-LABEL: name: ptrtoint_s64_p0 +name: ptrtoint_s64_p0 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %x0 + %0(p0) = COPY %x0 + %1(s64) = G_PTRTOINT %0 + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: ptrtoint_s32_p0 +name: ptrtoint_s32_p0 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %0.sub_32 +body: | + bb.0: + liveins: %x0 + %0(p0) = COPY %x0 + %1(s32) = G_PTRTOINT %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: ptrtoint_s16_p0 +name: ptrtoint_s16_p0 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %0.sub_32 +body: | + bb.0: + liveins: %x0 + %0(p0) = COPY %x0 + %1(s16) = G_PTRTOINT %0 + %w0 = COPY %1(s16) +... + +--- +# CHECK-LABEL: name: ptrtoint_s8_p0 +name: ptrtoint_s8_p0 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %0.sub_32 +body: | + bb.0: + liveins: %x0 + %0(p0) = COPY %x0 + %1(s8) = G_PTRTOINT %0 + %w0 = COPY %1(s8) +... + +--- +# CHECK-LABEL: name: ptrtoint_s1_p0 +name: ptrtoint_s1_p0 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %0.sub_32 +body: | + bb.0: + liveins: %x0 + %0(p0) = COPY %x0 + %1(s1) = G_PTRTOINT %0 + %w0 = COPY %1(s1) +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/select-load.mir b/test/CodeGen/AArch64/GlobalISel/select-load.mir new file mode 100644 index 000000000000..9188e2b0c0fc --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-load.mir @@ -0,0 +1,515 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @load_s64_gpr(i64* %addr) { ret void } + define void @load_s32_gpr(i32* %addr) { ret void } + define void @load_s16_gpr(i16* %addr) { ret void } + define void @load_s8_gpr(i8* %addr) { ret void } + + define void @load_fi_s64_gpr() { + %ptr0 = alloca i64 + ret void + } + + define void @load_gep_128_s64_gpr(i64* %addr) { ret void } + define void @load_gep_512_s32_gpr(i32* %addr) { ret void } + define void @load_gep_64_s16_gpr(i16* %addr) { ret void } + define void @load_gep_1_s8_gpr(i8* %addr) { ret void } + + define void @load_s64_fpr(i64* %addr) { ret void } + define void @load_s32_fpr(i32* %addr) { ret void } + define void @load_s16_fpr(i16* %addr) { ret void } + define void @load_s8_fpr(i8* %addr) { ret void } + + define void @load_gep_8_s64_fpr(i64* %addr) { ret void } + define void @load_gep_16_s32_fpr(i32* %addr) { ret void } + define void @load_gep_64_s16_fpr(i16* %addr) { ret void } + define void @load_gep_32_s8_fpr(i8* %addr) { ret void } + +... + +--- +# CHECK-LABEL: name: load_s64_gpr +name: load_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = LDRXui %0, 0 :: (load 8 from %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_LOAD %0 :: (load 8 from %ir.addr) + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: load_s32_gpr +name: load_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = LDRWui %0, 0 :: (load 4 from %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s32) = G_LOAD %0 :: (load 4 from %ir.addr) + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: load_s16_gpr +name: load_s16_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = LDRHHui %0, 0 :: (load 2 from %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s16) = G_LOAD %0 :: (load 2 from %ir.addr) + %w0 = COPY %1(s16) +... + +--- +# CHECK-LABEL: name: load_s8_gpr +name: load_s8_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = LDRBBui %0, 0 :: (load 1 from %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s8) = G_LOAD %0 :: (load 1 from %ir.addr) + %w0 = COPY %1(s8) +... 
+ +--- +# CHECK-LABEL: name: load_fi_s64_gpr +name: load_fi_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +stack: + - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 } + +# CHECK: body: +# CHECK: %1 = LDRXui %stack.0.ptr0, 0 :: (load 8) +# CHECK: %x0 = COPY %1 +body: | + bb.0: + liveins: %x0 + + %0(p0) = G_FRAME_INDEX %stack.0.ptr0 + %1(s64) = G_LOAD %0 :: (load 8) + %x0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: load_gep_128_s64_gpr +name: load_gep_128_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %3 = LDRXui %0, 16 :: (load 8 from %ir.addr) +# CHECK: %x0 = COPY %3 +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 128 + %2(p0) = G_GEP %0, %1 + %3(s64) = G_LOAD %2 :: (load 8 from %ir.addr) + %x0 = COPY %3 +... + +--- +# CHECK-LABEL: name: load_gep_512_s32_gpr +name: load_gep_512_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %3 = LDRWui %0, 128 :: (load 4 from %ir.addr) +# CHECK: %w0 = COPY %3 +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 512 + %2(p0) = G_GEP %0, %1 + %3(s32) = G_LOAD %2 :: (load 4 from %ir.addr) + %w0 = COPY %3 +... + +--- +# CHECK-LABEL: name: load_gep_64_s16_gpr +name: load_gep_64_s16_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %3 = LDRHHui %0, 32 :: (load 2 from %ir.addr) +# CHECK: %w0 = COPY %3 +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 64 + %2(p0) = G_GEP %0, %1 + %3(s16) = G_LOAD %2 :: (load 2 from %ir.addr) + %w0 = COPY %3 +... + +--- +# CHECK-LABEL: name: load_gep_1_s8_gpr +name: load_gep_1_s8_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %3 = LDRBBui %0, 1 :: (load 1 from %ir.addr) +# CHECK: %w0 = COPY %3 +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 1 + %2(p0) = G_GEP %0, %1 + %3(s8) = G_LOAD %2 :: (load 1 from %ir.addr) + %w0 = COPY %3 +... 
+ +--- +# CHECK-LABEL: name: load_s64_fpr +name: load_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = LDRDui %0, 0 :: (load 8 from %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_LOAD %0 :: (load 8 from %ir.addr) + %d0 = COPY %1(s64) +... + +--- +# CHECK-LABEL: name: load_s32_fpr +name: load_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = LDRSui %0, 0 :: (load 4 from %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s32) = G_LOAD %0 :: (load 4 from %ir.addr) + %s0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: load_s16_fpr +name: load_s16_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: fpr16 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = LDRHui %0, 0 :: (load 2 from %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s16) = G_LOAD %0 :: (load 2 from %ir.addr) + %h0 = COPY %1(s16) +... + +--- +# CHECK-LABEL: name: load_s8_fpr +name: load_s8_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: fpr8 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = LDRBui %0, 0 :: (load 1 from %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s8) = G_LOAD %0 :: (load 1 from %ir.addr) + %b0 = COPY %1(s8) +... + +--- +# CHECK-LABEL: name: load_gep_8_s64_fpr +name: load_gep_8_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: fpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %3 = LDRDui %0, 1 :: (load 8 from %ir.addr) +# CHECK: %d0 = COPY %3 +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 8 + %2(p0) = G_GEP %0, %1 + %3(s64) = G_LOAD %2 :: (load 8 from %ir.addr) + %d0 = COPY %3 +... + +--- +# CHECK-LABEL: name: load_gep_16_s32_fpr +name: load_gep_16_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: fpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %3 = LDRSui %0, 4 :: (load 4 from %ir.addr) +# CHECK: %s0 = COPY %3 +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 16 + %2(p0) = G_GEP %0, %1 + %3(s32) = G_LOAD %2 :: (load 4 from %ir.addr) + %s0 = COPY %3 +... 
+ +--- +# CHECK-LABEL: name: load_gep_64_s16_fpr +name: load_gep_64_s16_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: fpr16 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %3 = LDRHui %0, 32 :: (load 2 from %ir.addr) +# CHECK: %h0 = COPY %3 +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 64 + %2(p0) = G_GEP %0, %1 + %3(s16) = G_LOAD %2 :: (load 2 from %ir.addr) + %h0 = COPY %3 +... + +--- +# CHECK-LABEL: name: load_gep_32_s8_fpr +name: load_gep_32_s8_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: fpr8 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %3 = LDRBui %0, 32 :: (load 1 from %ir.addr) +# CHECK: %b0 = COPY %3 +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 32 + %2(p0) = G_GEP %0, %1 + %3(s8) = G_LOAD %2 :: (load 1 from %ir.addr) + %b0 = COPY %3 +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-muladd.mir b/test/CodeGen/AArch64/GlobalISel/select-muladd.mir new file mode 100644 index 000000000000..7d5b43bc16d5 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-muladd.mir @@ -0,0 +1,50 @@ +# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @SMADDLrrr_gpr() { ret void } +... + +--- +# CHECK-LABEL: name: SMADDLrrr_gpr +name: SMADDLrrr_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +# CHECK-NEXT: - { id: 3, class: gpr } +# CHECK-NEXT: - { id: 4, class: gpr } +# CHECK-NEXT: - { id: 5, class: gpr } +# CHECK-NEXT: - { id: 6, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %w1 +# CHECK: %2 = COPY %w2 +# CHECK: %6 = SMADDLrrr %1, %2, %0 +body: | + bb.0: + liveins: %x0, %w1, %w2 + + %0(s64) = COPY %x0 + %1(s32) = COPY %w1 + %2(s32) = COPY %w2 + %3(s64) = G_SEXT %1 + %4(s64) = G_SEXT %2 + %5(s64) = G_MUL %3, %4 + %6(s64) = G_ADD %0, %5 + %x0 = COPY %6 +... + diff --git a/test/CodeGen/AArch64/GlobalISel/select-property.mir b/test/CodeGen/AArch64/GlobalISel/select-property.mir new file mode 100644 index 000000000000..86961ac597e1 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-property.mir @@ -0,0 +1,21 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @selected_property() { ret void } +... + +--- +# Check that we set the "selected" property. 
+# CHECK-LABEL: name: selected_property +# CHECK: legalized: true +# CHECK-NEXT: regBankSelected: true +# CHECK-NEXT: selected: true +name: selected_property +legalized: true +regBankSelected: true +selected: false +body: | + bb.0: +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-store.mir b/test/CodeGen/AArch64/GlobalISel/select-store.mir new file mode 100644 index 000000000000..9b8f5c566ce0 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-store.mir @@ -0,0 +1,463 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @store_s64_gpr(i64* %addr) { ret void } + define void @store_s32_gpr(i32* %addr) { ret void } + define void @store_s16_gpr(i16* %addr) { ret void } + define void @store_s8_gpr(i8* %addr) { ret void } + + define void @store_zero_s64_gpr(i64* %addr) { ret void } + define void @store_zero_s32_gpr(i32* %addr) { ret void } + + define void @store_fi_s64_gpr() { + %ptr0 = alloca i64 + ret void + } + + define void @store_gep_128_s64_gpr(i64* %addr) { ret void } + define void @store_gep_512_s32_gpr(i32* %addr) { ret void } + define void @store_gep_64_s16_gpr(i16* %addr) { ret void } + define void @store_gep_1_s8_gpr(i8* %addr) { ret void } + + define void @store_s64_fpr(i64* %addr) { ret void } + define void @store_s32_fpr(i32* %addr) { ret void } + + define void @store_gep_8_s64_fpr(i64* %addr) { ret void } + define void @store_gep_8_s32_fpr(i32* %addr) { ret void } +... + +--- +# CHECK-LABEL: name: store_s64_gpr +name: store_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: STRXui %1, %0, 0 :: (store 8 into %ir.addr) +body: | + bb.0: + liveins: %x0, %x1 + + %0(p0) = COPY %x0 + %1(s64) = COPY %x1 + G_STORE %1, %0 :: (store 8 into %ir.addr) + +... + +--- +# CHECK-LABEL: name: store_s32_gpr +name: store_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %w1 +# CHECK: STRWui %1, %0, 0 :: (store 4 into %ir.addr) +body: | + bb.0: + liveins: %x0, %w1 + + %0(p0) = COPY %x0 + %1(s32) = COPY %w1 + G_STORE %1, %0 :: (store 4 into %ir.addr) + +... + +--- +# CHECK-LABEL: name: store_s16_gpr +name: store_s16_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %w1 +# CHECK: STRHHui %1, %0, 0 :: (store 2 into %ir.addr) +body: | + bb.0: + liveins: %x0, %w1 + + %0(p0) = COPY %x0 + %1(s16) = COPY %w1 + G_STORE %1, %0 :: (store 2 into %ir.addr) + +... 
+ +--- +# CHECK-LABEL: name: store_s8_gpr +name: store_s8_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %w1 +# CHECK: STRBBui %1, %0, 0 :: (store 1 into %ir.addr) +body: | + bb.0: + liveins: %x0, %w1 + + %0(p0) = COPY %x0 + %1(s8) = COPY %w1 + G_STORE %1, %0 :: (store 1 into %ir.addr) + +... + +--- +# CHECK-LABEL: name: store_zero_s64_gpr +name: store_zero_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: STRXui %xzr, %0, 0 :: (store 8 into %ir.addr) +body: | + bb.0: + liveins: %x0, %x1 + + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 0 + G_STORE %1, %0 :: (store 8 into %ir.addr) + +... + +--- +# CHECK-LABEL: name: store_zero_s32_gpr +name: store_zero_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: STRWui %wzr, %0, 0 :: (store 4 into %ir.addr) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(s32) = G_CONSTANT i32 0 + G_STORE %1, %0 :: (store 4 into %ir.addr) + +... + +--- +# CHECK-LABEL: name: store_fi_s64_gpr +name: store_fi_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +stack: + - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: STRXui %0, %stack.0.ptr0, 0 :: (store 8) +body: | + bb.0: + liveins: %x0 + + %0(p0) = COPY %x0 + %1(p0) = G_FRAME_INDEX %stack.0.ptr0 + G_STORE %0, %1 :: (store 8) +... + +--- +# CHECK-LABEL: name: store_gep_128_s64_gpr +name: store_gep_128_s64_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr64 } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %x1 +# CHECK: STRXui %1, %0, 16 :: (store 8 into %ir.addr) +body: | + bb.0: + liveins: %x0, %x1 + + %0(p0) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = G_CONSTANT i64 128 + %3(p0) = G_GEP %0, %2 + G_STORE %1, %3 :: (store 8 into %ir.addr) +... 
+ +--- +# CHECK-LABEL: name: store_gep_512_s32_gpr +name: store_gep_512_s32_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %w1 +# CHECK: STRWui %1, %0, 128 :: (store 4 into %ir.addr) +body: | + bb.0: + liveins: %x0, %w1 + + %0(p0) = COPY %x0 + %1(s32) = COPY %w1 + %2(s64) = G_CONSTANT i64 512 + %3(p0) = G_GEP %0, %2 + G_STORE %1, %3 :: (store 4 into %ir.addr) +... + +--- +# CHECK-LABEL: name: store_gep_64_s16_gpr +name: store_gep_64_s16_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %w1 +# CHECK: STRHHui %1, %0, 32 :: (store 2 into %ir.addr) +body: | + bb.0: + liveins: %x0, %w1 + + %0(p0) = COPY %x0 + %1(s16) = COPY %w1 + %2(s64) = G_CONSTANT i64 64 + %3(p0) = G_GEP %0, %2 + G_STORE %1, %3 :: (store 2 into %ir.addr) +... + +--- +# CHECK-LABEL: name: store_gep_1_s8_gpr +name: store_gep_1_s8_gpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %w1 +# CHECK: STRBBui %1, %0, 1 :: (store 1 into %ir.addr) +body: | + bb.0: + liveins: %x0, %w1 + + %0(p0) = COPY %x0 + %1(s8) = COPY %w1 + %2(s64) = G_CONSTANT i64 1 + %3(p0) = G_GEP %0, %2 + G_STORE %1, %3 :: (store 1 into %ir.addr) +... + +--- +# CHECK-LABEL: name: store_s64_fpr +name: store_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: fpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %d1 +# CHECK: STRDui %1, %0, 0 :: (store 8 into %ir.addr) +body: | + bb.0: + liveins: %x0, %d1 + + %0(p0) = COPY %x0 + %1(s64) = COPY %d1 + G_STORE %1, %0 :: (store 8 into %ir.addr) + +... + +--- +# CHECK-LABEL: name: store_s32_fpr +name: store_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: fpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %s1 +# CHECK: STRSui %1, %0, 0 :: (store 4 into %ir.addr) +body: | + bb.0: + liveins: %x0, %s1 + + %0(p0) = COPY %x0 + %1(s32) = COPY %s1 + G_STORE %1, %0 :: (store 4 into %ir.addr) + +... 
+ +--- +# CHECK-LABEL: name: store_gep_8_s64_fpr +name: store_gep_8_s64_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: fpr64 } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %d1 +# CHECK: STRDui %1, %0, 1 :: (store 8 into %ir.addr) +body: | + bb.0: + liveins: %x0, %d1 + + %0(p0) = COPY %x0 + %1(s64) = COPY %d1 + %2(s64) = G_CONSTANT i64 8 + %3(p0) = G_GEP %0, %2 + G_STORE %1, %3 :: (store 8 into %ir.addr) +... + +--- +# CHECK-LABEL: name: store_gep_8_s32_fpr +name: store_gep_8_s32_fpr +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +# CHECK-NEXT: - { id: 1, class: fpr32 } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr } +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: %0 = COPY %x0 +# CHECK: %1 = COPY %s1 +# CHECK: STRSui %1, %0, 2 :: (store 4 into %ir.addr) +body: | + bb.0: + liveins: %x0, %s1 + + %0(p0) = COPY %x0 + %1(s32) = COPY %s1 + %2(s64) = G_CONSTANT i64 8 + %3(p0) = G_GEP %0, %2 + G_STORE %1, %3 :: (store 4 into %ir.addr) +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-trunc.mir b/test/CodeGen/AArch64/GlobalISel/select-trunc.mir new file mode 100644 index 000000000000..fc3546e777f7 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-trunc.mir @@ -0,0 +1,81 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @trunc_s32_s64() { ret void } + define void @trunc_s8_s64() { ret void } + define void @trunc_s1_s32() { ret void } +... + +--- +# CHECK-LABEL: name: trunc_s32_s64 +name: trunc_s32_s64 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %1 = COPY %0.sub_32 +body: | + bb.0: + liveins: %x0 + + %0(s64) = COPY %x0 + %1(s32) = G_TRUNC %0 + %w0 = COPY %1(s32) +... + +--- +# CHECK-LABEL: name: trunc_s8_s64 +name: trunc_s8_s64 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %1 = COPY %0.sub_32 +body: | + bb.0: + liveins: %x0 + + %0(s64) = COPY %x0 + %1(s8) = G_TRUNC %0 + %w0 = COPY %1(s8) +... + +--- +# CHECK-LABEL: name: trunc_s1_s32 +name: trunc_s1_s32 +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + +# CHECK: body: +# CHECK: %1 = COPY %0 +body: | + bb.0: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s1) = G_TRUNC %0 + %w0 = COPY %1(s1) +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/select-xor.mir b/test/CodeGen/AArch64/GlobalISel/select-xor.mir
new file mode 100644
index 000000000000..e787849c8d1b
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-xor.mir
@@ -0,0 +1,165 @@
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @xor_s32_gpr() { ret void }
+  define void @xor_s64_gpr() { ret void }
+  define void @xor_constant_n1_s32_gpr() { ret void }
+  define void @xor_constant_n1_s64_gpr() { ret void }
+  define void @xor_constant_n1_s32_gpr_2bb() { ret void }
+
+...
+
+---
+# Check that we select a 32-bit GPR G_XOR into EORWrr on GPR32.
+# Also check that we constrain the register class of the COPY to GPR32.
+# CHECK-LABEL: name: xor_s32_gpr
+name: xor_s32_gpr
+legalized: true
+regBankSelected: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32 }
+# CHECK-NEXT: - { id: 1, class: gpr32 }
+# CHECK-NEXT: - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK: body:
+# CHECK: %0 = COPY %w0
+# CHECK: %1 = COPY %w1
+# CHECK: %2 = EORWrr %0, %1
+body: |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_XOR %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as xor_s32_gpr, for 64-bit operations.
+# CHECK-LABEL: name: xor_s64_gpr
+name: xor_s64_gpr
+legalized: true
+regBankSelected: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr64 }
+# CHECK-NEXT: - { id: 1, class: gpr64 }
+# CHECK-NEXT: - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK: body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %x1
+# CHECK: %2 = EORXrr %0, %1
+body: |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_XOR %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Check that we select a 32-bit G_XOR with constant -1 into ORNWrr on GPR32.
+# Also check that we constrain the register class of the COPY to GPR32.
+# CHECK-LABEL: name: xor_constant_n1_s32_gpr
+name: xor_constant_n1_s32_gpr
+legalized: true
+regBankSelected: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32 }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK: body:
+# CHECK: %0 = COPY %w0
+# CHECK: %2 = ORNWrr %wzr, %0
+body: |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s32) = G_CONSTANT i32 -1
+    %2(s32) = G_XOR %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as xor_constant_n1_s32_gpr, for 64-bit operations.
+# CHECK-LABEL: name: xor_constant_n1_s64_gpr
+name: xor_constant_n1_s64_gpr
+legalized: true
+regBankSelected: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr64 }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK: body:
+# CHECK: %0 = COPY %x0
+# CHECK: %2 = ORNXrr %xzr, %0
+body: |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s64) = G_CONSTANT i64 -1
+    %2(s64) = G_XOR %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Check that we can obtain constants from other basic blocks.
+# CHECK-LABEL: name: xor_constant_n1_s32_gpr_2bb +name: xor_constant_n1_s32_gpr_2bb +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: B %bb.1 +# CHECK: %0 = COPY %w0 +# CHECK: %2 = ORNWrr %wzr, %0 + +body: | + bb.0: + liveins: %w0, %w1 + successors: %bb.1 + %1(s32) = G_CONSTANT i32 -1 + G_BR %bb.1 + bb.1: + %0(s32) = COPY %w0 + %2(s32) = G_XOR %0, %1 + %w0 = COPY %2(s32) +... diff --git a/test/CodeGen/AArch64/GlobalISel/select.mir b/test/CodeGen/AArch64/GlobalISel/select.mir new file mode 100644 index 000000000000..8bffa085fdca --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select.mir @@ -0,0 +1,311 @@ +# RUN: llc -O0 -mtriple=aarch64-apple-ios -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=IOS +# RUN: llc -O0 -mtriple=aarch64-linux-gnu -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-DEFAULT +# RUN: llc -O0 -mtriple=aarch64-linux-gnu -relocation-model=pic -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-PIC + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @frame_index() { + %ptr0 = alloca i64 + ret void + } + + define i8* @gep(i8* %in) { ret i8* undef } + + define i8* @ptr_mask(i8* %in) { ret i8* undef } + + @var_local = global i8 0 + define i8* @global_local() { ret i8* undef } + + @var_got = external global i8 + define i8* @global_got() { ret i8* undef } + + define void @icmp() { ret void } + define void @fcmp() { ret void } + + define void @phi() { ret void } + + define void @select() { ret void } +... + +--- +# CHECK-LABEL: name: frame_index +name: frame_index +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr64sp } +registers: + - { id: 0, class: gpr } + +stack: + - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 } + +# CHECK: body: +# CHECK: %0 = ADDXri %stack.0.ptr0, 0, 0 +body: | + bb.0: + %0(p0) = G_FRAME_INDEX %stack.0.ptr0 + %x0 = COPY %0(p0) +... + +--- +# CHECK-LABEL: name: gep +name: gep +legalized: true +regBankSelected: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %1 = MOVi64imm 42 +# CHECK: %2 = ADDXrr %0, %1 +body: | + bb.0: + liveins: %x0 + %0(p0) = COPY %x0 + %1(s64) = G_CONSTANT i64 42 + %2(p0) = G_GEP %0, %1(s64) + %x0 = COPY %2(p0) +... + +--- +# CHECK-LABEL: name: ptr_mask +name: ptr_mask +legalized: true +regBankSelected: true + +# CHECK: body: +# CHECK: %1 = ANDXri %0, 8060 +body: | + bb.0: + liveins: %x0 + %0:gpr(p0) = COPY %x0 + %1:gpr(p0) = G_PTR_MASK %0, 3 + %x0 = COPY %1(p0) +... 
+ +--- +# Global defined in the same linkage unit so no GOT is needed +# CHECK-LABEL: name: global_local +name: global_local +legalized: true +regBankSelected: true +registers: + - { id: 0, class: gpr } + +# CHECK: body: +# IOS: %0 = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local +# LINUX-DEFAULT: %0 = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local +# LINUX-PIC: %0 = LOADgot target-flags(aarch64-got) @var_local +body: | + bb.0: + %0(p0) = G_GLOBAL_VALUE @var_local + %x0 = COPY %0(p0) +... + +--- +# CHECK-LABEL: name: global_got +name: global_got +legalized: true +regBankSelected: true +registers: + - { id: 0, class: gpr } + +# CHECK: body: +# IOS: %0 = LOADgot target-flags(aarch64-got) @var_got +# LINUX-DEFAULT: %0 = MOVaddr target-flags(aarch64-page) @var_got, target-flags(aarch64-pageoff, aarch64-nc) @var_got +# LINUX-PIC: %0 = LOADgot target-flags(aarch64-got) @var_got +body: | + bb.0: + %0(p0) = G_GLOBAL_VALUE @var_got + %x0 = COPY %0(p0) +... + +--- +# CHECK-LABEL: name: icmp +name: icmp +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr64 } +# CHECK-NEXT: - { id: 3, class: gpr32 } +# CHECK-NEXT: - { id: 4, class: gpr64 } +# CHECK-NEXT: - { id: 5, class: gpr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + +# CHECK: body: +# CHECK: %wzr = SUBSWrr %0, %0, implicit-def %nzcv +# CHECK: %1 = CSINCWr %wzr, %wzr, 1, implicit %nzcv + +# CHECK: %xzr = SUBSXrr %2, %2, implicit-def %nzcv +# CHECK: %3 = CSINCWr %wzr, %wzr, 3, implicit %nzcv + +# CHECK: %xzr = SUBSXrr %4, %4, implicit-def %nzcv +# CHECK: %5 = CSINCWr %wzr, %wzr, 0, implicit %nzcv + +body: | + bb.0: + liveins: %w0, %x0 + + %0(s32) = COPY %w0 + %1(s1) = G_ICMP intpred(eq), %0, %0 + %w0 = COPY %1(s1) + + %2(s64) = COPY %x0 + %3(s1) = G_ICMP intpred(uge), %2, %2 + %w0 = COPY %3(s1) + + %4(p0) = COPY %x0 + %5(s1) = G_ICMP intpred(ne), %4, %4 + %w0 = COPY %5(s1) +... + +--- +# CHECK-LABEL: name: fcmp +name: fcmp +legalized: true +regBankSelected: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: fpr64 } +# CHECK-NEXT: - { id: 3, class: gpr32 } +# CHECK-NEXT: - { id: 4, class: gpr32 } +# CHECK-NEXT: - { id: 5, class: gpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + - { id: 2, class: fpr } + - { id: 3, class: gpr } + +# CHECK: body: +# CHECK: FCMPSrr %0, %0, implicit-def %nzcv +# CHECK: [[TST_MI:%[0-9]+]] = CSINCWr %wzr, %wzr, 5, implicit %nzcv +# CHECK: [[TST_GT:%[0-9]+]] = CSINCWr %wzr, %wzr, 13, implicit %nzcv +# CHECK: %1 = ORRWrr [[TST_MI]], [[TST_GT]] + +# CHECK: FCMPDrr %2, %2, implicit-def %nzcv +# CHECK: %3 = CSINCWr %wzr, %wzr, 4, implicit %nzcv + +body: | + bb.0: + liveins: %w0, %x0 + + %0(s32) = COPY %s0 + %1(s1) = G_FCMP floatpred(one), %0, %0 + %w0 = COPY %1(s1) + + %2(s64) = COPY %d0 + %3(s1) = G_FCMP floatpred(uge), %2, %2 + %w0 = COPY %3(s1) + +... 
+ +--- +# CHECK-LABEL: name: phi +name: phi +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: fpr32 } +registers: + - { id: 0, class: fpr } + - { id: 1, class: gpr } + - { id: 2, class: fpr } + +# CHECK: body: +# CHECK: bb.1: +# CHECK: %2 = PHI %0, %bb.0, %2, %bb.1 + +body: | + bb.0: + liveins: %s0, %w0 + successors: %bb.1 + %0(s32) = COPY %s0 + %1(s1) = COPY %w0 + + bb.1: + successors: %bb.1, %bb.2 + %2(s32) = PHI %0, %bb.0, %2, %bb.1 + G_BRCOND %1, %bb.1 + + bb.2: + %s0 = COPY %2 + RET_ReallyLR implicit %s0 +... + +--- +# CHECK-LABEL: name: select +name: select +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr32 } +# CHECK-NEXT: - { id: 1, class: gpr32 } +# CHECK-NEXT: - { id: 2, class: gpr32 } +# CHECK-NEXT: - { id: 3, class: gpr32 } +# CHECK-NEXT: - { id: 4, class: gpr64 } +# CHECK-NEXT: - { id: 5, class: gpr64 } +# CHECK-NEXT: - { id: 6, class: gpr64 } +# CHECK-NEXT: - { id: 7, class: gpr64 } +# CHECK-NEXT: - { id: 8, class: gpr64 } +# CHECK-NEXT: - { id: 9, class: gpr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } + - { id: 7, class: gpr } + - { id: 8, class: gpr } + - { id: 9, class: gpr } + +# CHECK: body: +# CHECK: %wzr = ANDSWri %0, 0, implicit-def %nzcv +# CHECK: %3 = CSELWr %1, %2, 1, implicit %nzcv +# CHECK: %wzr = ANDSWri %0, 0, implicit-def %nzcv +# CHECK: %6 = CSELXr %4, %5, 1, implicit %nzcv +# CHECK: %wzr = ANDSWri %0, 0, implicit-def %nzcv +# CHECK: %9 = CSELXr %7, %8, 1, implicit %nzcv +body: | + bb.0: + liveins: %w0, %w1, %w2 + %0(s1) = COPY %w0 + + %1(s32) = COPY %w1 + %2(s32) = COPY %w2 + %3(s32) = G_SELECT %0, %1, %2 + %w0 = COPY %3(s32) + + %4(s64) = COPY %x0 + %5(s64) = COPY %x1 + %6(s64) = G_SELECT %0, %4, %5 + %x0 = COPY %6(s64) + + %7(p0) = COPY %x0 + %8(p0) = COPY %x1 + %9(p0) = G_SELECT %0, %7, %8 + %x0 = COPY %9(p0) +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll index 14dbc7c3c31a..e4c18757418d 100644 --- a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll +++ b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll @@ -58,8 +58,8 @@ define i32* @const_then_var(%type1* %addr, i64 %idx) { ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0 ; CHECK: [[IDX:%[0-9]+]](s64) = COPY %x1 ; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT i64 272 -; CHECK: [[BASE1:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET1]](s64) ; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT i64 4 +; CHECK: [[BASE1:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET1]](s64) ; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_MUL [[SIZE]], [[IDX]] ; CHECK: [[BASE2:%[0-9]+]](p0) = G_GEP [[BASE1]], [[OFFSET2]](s64) ; CHECK: [[RES:%[0-9]+]](p0) = COPY [[BASE2]](p0) @@ -74,9 +74,9 @@ define i32* @var_then_const(%type1* %addr, i64 %idx) { ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0 ; CHECK: [[IDX:%[0-9]+]](s64) = COPY %x1 ; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT i64 64 +; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_CONSTANT i64 40 ; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_MUL [[SIZE]], [[IDX]] ; CHECK: [[BASE1:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET1]](s64) -; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_CONSTANT i64 40 ; CHECK: [[BASE2:%[0-9]+]](p0) = G_GEP [[BASE1]], [[OFFSET2]](s64) ; CHECK: %x0 = COPY [[BASE2]](p0) diff --git a/test/CodeGen/AArch64/GlobalISel/varargs-ios-translator.ll b/test/CodeGen/AArch64/GlobalISel/varargs-ios-translator.ll new file mode 100644 index 000000000000..3bd56fa4cebc --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/varargs-ios-translator.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=aarch64-apple-ios -stop-after=instruction-select -global-isel -verify-machineinstrs %s -o - | FileCheck %s + +define void @test_varargs_sentinel(i8* %list, i64, i64, i64, i64, i64, i64, i64, + i32, ...) 
{ +; CHECK-LABEL: name: test_varargs_sentinel +; CHECK: fixedStack: +; CHECK: - { id: [[VARARGS_SLOT:[0-9]+]], offset: 8 +; CHECK: body: +; CHECK: [[LIST:%[0-9]+]] = COPY %x0 +; CHECK: [[VARARGS_AREA:%[0-9]+]] = ADDXri %fixed-stack.[[VARARGS_SLOT]], 0, 0 +; CHECK: STRXui [[VARARGS_AREA]], [[LIST]], 0 :: (store 8 into %ir.list, align 0) + call void @llvm.va_start(i8* %list) + ret void +} + +declare void @llvm.va_start(i8*) diff --git a/test/CodeGen/AArch64/GlobalISel/vastart.ll b/test/CodeGen/AArch64/GlobalISel/vastart.ll new file mode 100644 index 000000000000..ae44e8fc5dea --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/vastart.ll @@ -0,0 +1,13 @@ +; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - -mtriple=aarch64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-IOS %s +; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LINUX %s + + +declare void @llvm.va_start(i8*) +define void @test_va_start(i8* %list) { +; CHECK-LABEL: name: test_va_start +; CHECK: [[LIST:%[0-9]+]](p0) = COPY %x0 +; CHECK-IOS: G_VASTART [[LIST]](p0) :: (store 8 into %ir.list, align 0) +; CHECK-LINUX: G_VASTART [[LIST]](p0) :: (store 32 into %ir.list, align 0) + call void @llvm.va_start(i8* %list) + ret void +} diff --git a/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll b/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll new file mode 100644 index 000000000000..3fe7e65bf245 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll @@ -0,0 +1,68 @@ +; RUN: opt -codegenprepare < %s -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%struct.match_state = type { i64, i64 } + +; %add is also promoted by forking an extra sext. +define void @promoteTwoOne(i32 %i, i32 %j, i64* %P1, i64* %P2 ) { +; CHECK-LABEL: @promoteTwoOne +; CHECK-LABEL: entry: +; CHECK: %[[SEXT1:.*]] = sext i32 %i to i64 +; CHECK: %[[SEXT2:.*]] = sext i32 %j to i64 +; CHECK: %add = add nsw i64 %[[SEXT1]], %[[SEXT2]] +entry: + %add = add nsw i32 %i, %j + %s = sext i32 %add to i64 + %addr1 = getelementptr inbounds i64, i64* %P1, i64 %s + store i64 %s, i64* %addr1 + %s2 = sext i32 %i to i64 + %addr2 = getelementptr inbounds i64, i64* %P2, i64 %s2 + store i64 %s2, i64* %addr2 + ret void +} + +; Both %add1 and %add2 are promoted by forking extra sexts. 
+define void @promoteTwoTwo(i32 %i, i32 %j, i32 %k, i64* %P1, i64* %P2) { +; CHECK-LABEL: @promoteTwoTwo +; CHECK-LABEL:entry: +; CHECK: %[[SEXT1:.*]] = sext i32 %j to i64 +; CHECK: %[[SEXT2:.*]] = sext i32 %i to i64 +; CHECK: %add1 = add nsw i64 %[[SEXT1]], %[[SEXT2]] +; CHECK: %[[SEXT3:.*]] = sext i32 %k to i64 +; CHECK: %add2 = add nsw i64 %[[SEXT1]], %[[SEXT3]] +entry: + %add1 = add nsw i32 %j, %i + %s = sext i32 %add1 to i64 + %addr1 = getelementptr inbounds i64, i64* %P1, i64 %s + store i64 %s, i64* %addr1 + %add2 = add nsw i32 %j, %k + %s2 = sext i32 %add2 to i64 + %addr2 = getelementptr inbounds i64, i64* %P2, i64 %s2 + store i64 %s2, i64* %addr2 + ret void +} + +define i64 @promoteGEPSunk(i1 %cond, i64* %base, i32 %i) { +; CHECK-LABEL: @promoteGEPSunk +; CHECK-LABEL: entry: +; CHECK: %[[SEXT:.*]] = sext i32 %i to i64 +; CHECK: %add = add nsw i64 %[[SEXT]], 1 +; CHECK: %add2 = add nsw i64 %[[SEXT]], 2 +entry: + %add = add nsw i32 %i, 1 + %s = sext i32 %add to i64 + %addr = getelementptr inbounds i64, i64* %base, i64 %s + %add2 = add nsw i32 %i, 2 + %s2 = sext i32 %add2 to i64 + %addr2 = getelementptr inbounds i64, i64* %base, i64 %s2 + br i1 %cond, label %if.then, label %if.then2 +if.then: + %v = load i64, i64* %addr + %v2 = load i64, i64* %addr2 + %r = add i64 %v, %v2 + ret i64 %r +if.then2: + ret i64 0; +} diff --git a/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/test/CodeGen/AArch64/aarch64-fold-lslfast.ll new file mode 100644 index 000000000000..0dfe04b664d0 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-fold-lslfast.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s + +%struct.a = type [256 x i16] +%struct.b = type [256 x i32] +%struct.c = type [256 x i64] + +declare void @foo() +define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind { +; CHECK-LABEL: halfword: +; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8 +; CHECK: ldrh [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #1] +; CHECK: strh [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #1] + %shr81 = lshr i32 %xor72, 9 + %conv82 = zext i32 %shr81 to i64 + %idxprom83 = and i64 %conv82, 255 + %arrayidx86 = getelementptr inbounds %struct.a, %struct.a* %ctx, i64 0, i64 %idxprom83 + %result = load i16, i16* %arrayidx86, align 2 + call void @foo() + store i16 %result, i16* %arrayidx86, align 2 + ret i16 %result +} + +define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind { +; CHECK-LABEL: word: +; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8 +; CHECK: ldr [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #2] +; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #2] + %shr81 = lshr i32 %xor72, 9 + %conv82 = zext i32 %shr81 to i64 + %idxprom83 = and i64 %conv82, 255 + %arrayidx86 = getelementptr inbounds %struct.b, %struct.b* %ctx, i64 0, i64 %idxprom83 + %result = load i32, i32* %arrayidx86, align 4 + call void @foo() + store i32 %result, i32* %arrayidx86, align 4 + ret i32 %result +} + +define i64 @doubleword(%struct.c* %ctx, i32 %xor72) nounwind { +; CHECK-LABEL: doubleword: +; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8 +; CHECK: ldr [[REG1:x[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #3] +; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #3] + %shr81 = lshr i32 %xor72, 9 + %conv82 = zext i32 %shr81 to i64 + %idxprom83 = and i64 %conv82, 255 + %arrayidx86 = getelementptr inbounds %struct.c, %struct.c* %ctx, i64 0, i64 %idxprom83 + %result = load i64, i64* %arrayidx86, align 8 + call void @foo() + store i64 %result, i64* %arrayidx86, align 8 + ret i64 %result +} + +define i64 
@multi_use_non_memory(i64 %a, i64 %b) { +; CHECK-LABEL: multi_use_non_memory: +; CHECK: lsl [[REG1:x[0-9]+]], x0, #3 +; CHECK-NOT: cmp [[REG1]], x1, lsl # 3 +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], x1, #3 +; CHECK-NEXT: cmp [[REG1]], [[REG2]] +entry: + %mul1 = shl i64 %a, 3 + %mul2 = shl i64 %b, 3 + %cmp = icmp slt i64 %mul1, %mul2 + br i1 %cmp, label %truebb, label %falsebb +truebb: + tail call void @foo() + unreachable +falsebb: + %cmp2 = icmp sgt i64 %mul1, %mul2 + br i1 %cmp2, label %exitbb, label %endbb +exitbb: + ret i64 %mul1 +endbb: + ret i64 %mul2 +} diff --git a/test/CodeGen/AArch64/aarch64-gep-opt.ll b/test/CodeGen/AArch64/aarch64-gep-opt.ll index 6e4a47b04406..df9534ffde09 100644 --- a/test/CodeGen/AArch64/aarch64-gep-opt.ll +++ b/test/CodeGen/AArch64/aarch64-gep-opt.ll @@ -96,9 +96,13 @@ exit: ; CHECK-NoAA: add i64 [[TMP:%[a-zA-Z0-9]+]], 528 ; CHECK-NoAA: add i64 [[TMP]], 532 ; CHECK-NoAA: if.true: -; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 532 +; CHECK-NoAA: inttoptr +; CHECK-NoAA: bitcast +; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8, {{.*}}, i64 532 ; CHECK-NoAA: exit: -; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 528 +; CHECK-NoAA: inttoptr +; CHECK-NoAA: bitcast +; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8, {{.*}}, i64 528 ; CHECK-UseAA-LABEL: test_GEP_across_BB( ; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = getelementptr diff --git a/test/CodeGen/AArch64/aarch64-named-reg-w18.ll b/test/CodeGen/AArch64/aarch64-named-reg-w18.ll new file mode 100644 index 000000000000..341c7683dbaa --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-named-reg-w18.ll @@ -0,0 +1,14 @@ +; RUN: not llc -mtriple=aarch64-fuchsia -o - %s 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: llc -mtriple=aarch64-fuchsia -mattr=+reserve-x18 -o - %s + +define void @set_w18(i32 %x) { +entry: +; FIXME: Include an allocatable-specific error message +; ERROR: Invalid register name "w18". + tail call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +declare void @llvm.write_register.i32(metadata, i32) nounwind + +!0 = !{!"w18"} diff --git a/test/CodeGen/AArch64/aarch64-named-reg-x18.ll b/test/CodeGen/AArch64/aarch64-named-reg-x18.ll new file mode 100644 index 000000000000..eed852710ba0 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-named-reg-x18.ll @@ -0,0 +1,14 @@ +; RUN: not llc -mtriple=aarch64-fuchsia -o - %s 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: llc -mtriple=aarch64-fuchsia -mattr=+reserve-x18 -o - %s + +define void @set_x18(i64 %x) { +entry: +; FIXME: Include an allocatable-specific error message +; ERROR: Invalid register name "x18". + tail call void @llvm.write_register.i64(metadata !0, i64 %x) + ret void +} + +declare void @llvm.write_register.i64(metadata, i64) nounwind + +!0 = !{!"x18"} diff --git a/test/CodeGen/AArch64/and-sink.ll b/test/CodeGen/AArch64/and-sink.ll new file mode 100644 index 000000000000..91b7bd0db172 --- /dev/null +++ b/test/CodeGen/AArch64/and-sink.ll @@ -0,0 +1,90 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s +; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s + +@A = global i32 zeroinitializer +@B = global i32 zeroinitializer +@C = global i32 zeroinitializer + +; Test that and is sunk into cmp block to form tbz. 
+define i32 @and_sink1(i32 %a, i1 %c) { +; CHECK-LABEL: and_sink1: +; CHECK: tbz w1, #0 +; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A] +; CHECK: tbnz {{w[0-9]+}}, #2 + +; CHECK-CGP-LABEL: @and_sink1( +; CHECK-CGP-NOT: and i32 + %and = and i32 %a, 4 + br i1 %c, label %bb0, label %bb2 +bb0: +; CHECK-CGP-LABEL: bb0: +; CHECK-CGP: and i32 +; CHECK-CGP-NEXT: icmp eq i32 +; CHECK-CGP-NEXT: store +; CHECK-CGP-NEXT: br + %cmp = icmp eq i32 %and, 0 + store i32 0, i32* @A + br i1 %cmp, label %bb1, label %bb2 +bb1: + ret i32 1 +bb2: + ret i32 0 +} + +; Test that both 'and' and cmp get sunk to form tbz. +define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) { +; CHECK-LABEL: and_sink2: +; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A] +; CHECK: tbz w1, #0 +; CHECK: str wzr, [x{{[0-9]+}}, :lo12:B] +; CHECK: tbz w2, #0 +; CHECK: str wzr, [x{{[0-9]+}}, :lo12:C] +; CHECK: tbnz {{w[0-9]+}}, #2 + +; CHECK-CGP-LABEL: @and_sink2( +; CHECK-CGP-NOT: and i32 + %and = and i32 %a, 4 + store i32 0, i32* @A + br i1 %c, label %bb0, label %bb3 +bb0: +; CHECK-CGP-LABEL: bb0: +; CHECK-CGP-NOT: and i32 +; CHECK-CGP-NOT: icmp + %cmp = icmp eq i32 %and, 0 + store i32 0, i32* @B + br i1 %c2, label %bb1, label %bb3 +bb1: +; CHECK-CGP-LABEL: bb1: +; CHECK-CGP: and i32 +; CHECK-CGP-NEXT: icmp eq i32 +; CHECK-CGP-NEXT: store +; CHECK-CGP-NEXT: br + store i32 0, i32* @C + br i1 %cmp, label %bb2, label %bb0 +bb2: + ret i32 1 +bb3: + ret i32 0 +} + +; Test that 'and' is not sunk since cbz is a better alternative. +define i32 @and_sink3(i32 %a) { +; CHECK-LABEL: and_sink3: +; CHECK: and [[REG:w[0-9]+]], w0, #0x3 +; CHECK: [[LOOP:.L[A-Z0-9_]+]]: +; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A] +; CHECK: cbz [[REG]], [[LOOP]] + +; CHECK-CGP-LABEL: @and_sink3( +; CHECK-CGP-NEXT: and i32 + %and = and i32 %a, 3 + br label %bb0 +bb0: +; CHECK-CGP-LABEL: bb0: +; CHECK-CGP-NOT: and i32 + %cmp = icmp eq i32 %and, 0 + store i32 0, i32* @A + br i1 %cmp, label %bb0, label %bb2 +bb2: + ret i32 0 +} diff --git a/test/CodeGen/AArch64/argument-blocks.ll b/test/CodeGen/AArch64/argument-blocks.ll index 3169abc2dcb3..b5374ca8ced5 100644 --- a/test/CodeGen/AArch64/argument-blocks.ll +++ b/test/CodeGen/AArch64/argument-blocks.ll @@ -59,10 +59,10 @@ define i64 @test_hfa_ignores_gprs([7 x float], [2 x float] %in, i64, i64 %res) { } ; [2 x float] should not be promoted to double by the Darwin varargs handling, -; but should go in an 8-byte aligned slot. +; but should go in an 8-byte aligned slot and can be merged as integer stores. define void @test_varargs_stackalign() { ; CHECK-LABEL: test_varargs_stackalign: -; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16] +; CHECK-DARWINPCS: str {{x[0-9]+}}, [sp, #16] call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0]) ret void diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll index a29f8c4b57ab..0a7965571480 100644 --- a/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -3,7 +3,7 @@ ; rdar://13625505 ; Here we have 9 fixed integer arguments the 9th argument in on stack, the ; varargs start right after at 8-byte alignment. -define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp { +define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) 
nounwind noinline ssp { ; CHECK-LABEL: fn9: ; 9th fixed argument ; CHECK: ldr {{w[0-9]+}}, [sp, #64] @@ -30,7 +30,6 @@ define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, %a10 = alloca i32, align 4 %a11 = alloca i32, align 4 %a12 = alloca i32, align 4 - store i32 %a1, i32* %1, align 4 store i32 %a2, i32* %2, align 4 store i32 %a3, i32* %3, align 4 store i32 %a4, i32* %4, align 4 @@ -39,6 +38,7 @@ define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, store i32 %a7, i32* %7, align 4 store i32 %a8, i32* %8, align 4 store i32 %a9, i32* %9, align 4 + store i32 %a9, i32* %a1 %10 = bitcast i8** %args to i8* call void @llvm.va_start(i8* %10) %11 = va_arg i8** %args, i32 @@ -93,7 +93,7 @@ define i32 @main() nounwind ssp { %10 = load i32, i32* %a10, align 4 %11 = load i32, i32* %a11, align 4 %12 = load i32, i32* %a12, align 4 - call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12) + call void (i32*, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32* %a1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12) ret i32 0 } diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll index fb52b1d99fc9..6cf0ab35b9b5 100644 --- a/test/CodeGen/AArch64/arm64-abi.ll +++ b/test/CodeGen/AArch64/arm64-abi.ll @@ -205,10 +205,7 @@ declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32, define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind { entry: ; CHECK-LABEL: test8 -; CHECK: strb {{w[0-9]+}}, [sp, #3] -; CHECK: strb wzr, [sp, #2] -; CHECK: strb {{w[0-9]+}}, [sp, #1] -; CHECK: strb wzr, [sp] +; CHECK: str w8, [sp] ; CHECK: bl ; FAST-LABEL: test8 ; FAST: strb {{w[0-9]+}}, [sp] diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll index c57be5684ade..0009fe52e177 100644 --- a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll +++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll @@ -10,14 +10,17 @@ define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { ; CHECK: fullGtU ; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE ; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF] +; CHECK: sxtw [[I1:x[0-9]+]], w0 +; CHECK: sxtw [[I2:x[0-9]+]], w1 ; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]] -; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], w0, sxtw] -; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw] +; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], [[I1]]] +; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], [[I2]]] + ; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]] ; CHECK-NEXT: b.ne ; Next BB -; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw -; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw +; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], [[I2]] +; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], [[I1]] ; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1] ; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1] ; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]] diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll index e8fc4e68fcbe..6da767921632 100644 --- a/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/test/CodeGen/AArch64/arm64-addrmode.ll @@ -112,8 +112,8 @@ define void @t10(i64 %a) { define void @t11(i64 %a) { ; CHECK-LABEL: 
t11: -; CHECK: mov w[[NUM:[0-9]+]], #19070976 -; CHECK: movk w[[NUM:[0-9]+]], #17767 +; CHECK: mov w[[NUM:[0-9]+]], #17767 +; CHECK: movk w[[NUM:[0-9]+]], #291 ; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]] %1 = add i64 %a, 19088743 ;0x1234567 %2 = inttoptr i64 %1 to i64* diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll index c87103481adf..2c9a3bbaa500 100644 --- a/test/CodeGen/AArch64/arm64-atomic.ll +++ b/test/CodeGen/AArch64/arm64-atomic.ll @@ -9,10 +9,10 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 { ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -27,10 +27,12 @@ define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 { ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret %new = load i32, i32* %pnew %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 @@ -41,15 +43,15 @@ define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 { ; CHECK-LABEL: val_compare_and_swap_rel: ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]] +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], w1 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] -; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]] +; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -64,10 +66,10 @@ define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 { ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val diff --git a/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/test/CodeGen/AArch64/arm64-bitfield-extract.ll index 339dbbe18fc0..91aed060677a 100644 --- a/test/CodeGen/AArch64/arm64-bitfield-extract.ll +++ b/test/CodeGen/AArch64/arm64-bitfield-extract.ll @@ -348,8 +348,8 @@ entry: ; CHECK-LABEL: fct16: ; CHECK: ldr [[REG1:w[0-9]+]], ; Create the constant -; CHECK: mov [[REGCST:w[0-9]+]], #1703936 -; CHECK: movk [[REGCST]], #33120 +; CHECK: mov [[REGCST:w[0-9]+]], #33120 +; CHECK: movk [[REGCST]], #26, lsl #16 ; Do the masking ; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]] ; CHECK-NEXT: bfxil [[REG2]], w1, #16, #3 @@ -377,8 +377,8 @@ 
entry: ; CHECK-LABEL: fct17: ; CHECK: ldr [[REG1:x[0-9]+]], ; Create the constant -; CHECK: mov w[[REGCST:[0-9]+]], #1703936 -; CHECK: movk w[[REGCST]], #33120 +; CHECK: mov w[[REGCST:[0-9]+]], #33120 +; CHECK: movk w[[REGCST]], #26, lsl #16 ; Do the masking ; CHECK: and [[REG2:x[0-9]+]], [[REG1]], x[[REGCST]] ; CHECK-NEXT: bfxil [[REG2]], x1, #16, #3 diff --git a/test/CodeGen/AArch64/arm64-blockaddress.ll b/test/CodeGen/AArch64/arm64-blockaddress.ll index 5df840216352..b50ffdef5ddd 100644 --- a/test/CodeGen/AArch64/arm64-blockaddress.ll +++ b/test/CodeGen/AArch64/arm64-blockaddress.ll @@ -15,10 +15,10 @@ entry: ; CHECK-LINUX: add {{x[0-9]+}}, [[REG]], :lo12:.Ltmp1 ; CHECK-LARGE-LABEL: t: -; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]] -; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]] +; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g0_nc:[[DEST_LBL:.Ltmp[0-9]+]] ; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]] -; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g3:[[DEST_LBL]] %recover = alloca i64, align 8 store volatile i64 ptrtoint (i8* blockaddress(@t, %mylabel) to i64), i64* %recover, align 8 diff --git a/test/CodeGen/AArch64/arm64-builtins-linux.ll b/test/CodeGen/AArch64/arm64-builtins-linux.ll index 64239582f230..f86ee1afe555 100644 --- a/test/CodeGen/AArch64/arm64-builtins-linux.ll +++ b/test/CodeGen/AArch64/arm64-builtins-linux.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-fuchsia | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-fuchsia -code-model=kernel | FileCheck --check-prefix=FUCHSIA-KERNEL %s ; Function Attrs: nounwind readnone declare i8* @llvm.thread.pointer() #1 @@ -6,6 +8,8 @@ declare i8* @llvm.thread.pointer() #1 define i8* @thread_pointer() { ; CHECK: thread_pointer: ; CHECK: mrs {{x[0-9]+}}, TPIDR_EL0 +; FUCHSIA-KERNEL: thread_pointer: +; FUCHSIA-KERNEL: mrs {{x[0-9]+}}, TPIDR_EL1 %1 = tail call i8* @llvm.thread.pointer() ret i8* %1 } diff --git a/test/CodeGen/AArch64/arm64-code-model-large-abs.ll b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll index 9f50fea370e4..171941748c8f 100644 --- a/test/CodeGen/AArch64/arm64-code-model-large-abs.ll +++ b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll @@ -9,10 +9,10 @@ define i8* @global_addr() { ; CHECK-LABEL: global_addr: ret i8* @var8 ; The movz/movk calculation should end up returned directly in x0. 
-; CHECK: movz x0, #:abs_g3:var8 -; CHECK: movk x0, #:abs_g2_nc:var8 +; CHECK: movz x0, #:abs_g0_nc:var8 ; CHECK: movk x0, #:abs_g1_nc:var8 -; CHECK: movk x0, #:abs_g0_nc:var8 +; CHECK: movk x0, #:abs_g2_nc:var8 +; CHECK: movk x0, #:abs_g3:var8 ; CHECK-NEXT: ret } @@ -20,10 +20,10 @@ define i8 @global_i8() { ; CHECK-LABEL: global_i8: %val = load i8, i8* @var8 ret i8 %val -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8 -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8 +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:var8 ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8 -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g3:var8 ; CHECK: ldrb w0, [x[[ADDR_REG]]] } @@ -31,10 +31,10 @@ define i16 @global_i16() { ; CHECK-LABEL: global_i16: %val = load i16, i16* @var16 ret i16 %val -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16 -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16 +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:var16 ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16 -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g3:var16 ; CHECK: ldrh w0, [x[[ADDR_REG]]] } @@ -42,10 +42,10 @@ define i32 @global_i32() { ; CHECK-LABEL: global_i32: %val = load i32, i32* @var32 ret i32 %val -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32 -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32 +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:var32 ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32 -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g3:var32 ; CHECK: ldr w0, [x[[ADDR_REG]]] } @@ -53,10 +53,10 @@ define i64 @global_i64() { ; CHECK-LABEL: global_i64: %val = load i64, i64* @var64 ret i64 %val -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64 -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64 +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:var64 ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64 -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g3:var64 ; CHECK: ldr x0, [x[[ADDR_REG]]] } @@ -64,9 +64,9 @@ define <2 x i64> @constpool() { ; CHECK-LABEL: constpool: ret <2 x i64> -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:[[CPADDR:.LCPI[0-9]+_[0-9]+]] -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]] +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:[[CPADDR:.LCPI[0-9]+_[0-9]+]] ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:[[CPADDR]] -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:[[CPADDR]] +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]] +; CHECK: movk x[[ADDR_REG]], #:abs_g3:[[CPADDR]] ; CHECK: ldr q0, [x[[ADDR_REG]]] } diff --git a/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll index c9f668f2c424..a104b65ea861 100644 --- a/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll +++ b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll @@ -258,8 +258,7 @@ false: ; => We have one zext of %zextld left and we created one sext of %ld2. ; 2. We try to promote the operand of %sextaddza. ; a. This creates one sext of %zexta and one of %zextld -; b. The sext of %zexta does not lead to any load, it stays here, even if it -; could have been combine with the zext of %a. +; b. The sext of %zexta can be combined with the zext of %a. ; c. The sext of %zextld leads to %ld and can be combined with it. This is ; done by promoting %zextld. 
This is fine with the current heuristic: ; neutral. @@ -281,16 +280,14 @@ false: ; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %addr1 ; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64 ; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64 -; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64 ; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32, i32* %addr2 ; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64 -; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]] -; We do not combine this one: see 2.b. -; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32 -; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64 -; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]] +; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64 +; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_3]] +; OPT-NEXT: [[ZEXTLD1_4:%[a-zA-Z_0-9-]+]] = zext i8 %a to i64 +; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXTLD1_4]], [[ZEXTLD1_2]] ; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64 -; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]] +; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_1]] ; ; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 ; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64 @@ -636,3 +633,24 @@ define i64 @doNotPromoteBecauseOfPairedLoad(i32* %p, i32 %cst) { %final = add i64 %sextres, %zextLd0 ret i64 %final } + +define i64 @promoteZextShl(i1 %c, i16* %P) { +entry: +; OPTALL-LABEL: promoteZextShl +; OPTALL-LABEL: entry: +; OPT: %[[LD:.*]] = load i16, i16* %P +; OPT: %[[EXT:.*]] = zext i16 %[[LD]] to i64 +; OPT-LABEL: if.then: +; OPT: shl nsw i64 %[[EXT]], 1 +; DISABLE-LABEL: if.then: +; DISABLE: %r = sext i32 %shl2 to i64 + %ld = load i16, i16* %P + br i1 %c, label %end, label %if.then +if.then: + %z = zext i16 %ld to i32 + %shl2 = shl nsw i32 %z, 1 + %r = sext i32 %shl2 to i64 + ret i64 %r +end: + ret i64 0 +} diff --git a/test/CodeGen/AArch64/arm64-const-addr.ll b/test/CodeGen/AArch64/arm64-const-addr.ll index e55db2904489..bbb1ce4aced7 100644 --- a/test/CodeGen/AArch64/arm64-const-addr.ll +++ b/test/CodeGen/AArch64/arm64-const-addr.ll @@ -5,8 +5,8 @@ ; Test if the constant base address gets only materialized once. 
define i32 @test1() nounwind { ; CHECK-LABEL: test1 -; CHECK: mov w8, #68091904 -; CHECK-NEXT: movk w8, #49152 +; CHECK: mov w8, #49152 +; CHECK-NEXT: movk w8, #1039, lsl #16 ; CHECK-NEXT: ldp w9, w10, [x8, #4] ; CHECK: ldr w8, [x8, #12] %at = inttoptr i64 68141056 to %T* diff --git a/test/CodeGen/AArch64/arm64-crc32.ll b/test/CodeGen/AArch64/arm64-crc32.ll index 22111de5a3aa..df9465a6bda5 100644 --- a/test/CodeGen/AArch64/arm64-crc32.ll +++ b/test/CodeGen/AArch64/arm64-crc32.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=arm64-eabi -mattr=+crc -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-eabi -mcpu=cortex-a53 -mattr=+crc -o - %s | FileCheck %s define i32 @test_crc32b(i32 %cur, i8 %next) { ; CHECK-LABEL: test_crc32b: diff --git a/test/CodeGen/AArch64/arm64-elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll index b1d5524aee87..92dc8179f8ea 100644 --- a/test/CodeGen/AArch64/arm64-elf-globals.ll +++ b/test/CodeGen/AArch64/arm64-elf-globals.ll @@ -2,6 +2,10 @@ ; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST ; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC ; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC +; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -o - %s -mcpu=cyclone | FileCheck %s +; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST +; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC +; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC @var8 = external global i8, align 1 @var16 = external global i16, align 2 diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll index f00efbcea780..990782cb69a0 100644 --- a/test/CodeGen/AArch64/arm64-extern-weak.ll +++ b/test/CodeGen/AArch64/arm64-extern-weak.ll @@ -15,10 +15,10 @@ define i32()* @foo() { ; In the large model, the usual relocations are absolute and can ; materialise 0. -; CHECK-LARGE: movz x0, #:abs_g3:var -; CHECK-LARGE: movk x0, #:abs_g2_nc:var +; CHECK-LARGE: movz x0, #:abs_g0_nc:var ; CHECK-LARGE: movk x0, #:abs_g1_nc:var -; CHECK-LARGE: movk x0, #:abs_g0_nc:var +; CHECK-LARGE: movk x0, #:abs_g2_nc:var +; CHECK-LARGE: movk x0, #:abs_g3:var } @@ -33,10 +33,10 @@ define i32* @bar() { ; In the large model, the usual relocations are absolute and can ; materialise 0. 
-; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var -; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g2_nc:arr_var +; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g0_nc:arr_var ; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g1_nc:arr_var -; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g0_nc:arr_var +; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g2_nc:arr_var +; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g3:arr_var } @defined_weak_var = internal unnamed_addr global i32 0 @@ -46,8 +46,8 @@ define i32* @wibble() { ; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var ; CHECK: add x0, [[BASE]], :lo12:defined_weak_var -; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var -; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var +; CHECK-LARGE: movz x0, #:abs_g0_nc:defined_weak_var ; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var -; CHECK-LARGE: movk x0, #:abs_g0_nc:defined_weak_var +; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var +; CHECK-LARGE: movk x0, #:abs_g3:defined_weak_var } diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll index 9dae7a6f5b69..4aa10da7243d 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll @@ -37,9 +37,9 @@ entry: define signext i8 @foo3() nounwind ssp { entry: ; CHECK-LABEL: @foo3 -; CHECK: mov x[[REG:[0-9]+]], #12343736008704 +; CHECK: mov x[[REG:[0-9]+]], #12274 ; CHECK: movk x[[REG]], #29646, lsl #16 -; CHECK: movk x[[REG]], #12274 +; CHECK: movk x[[REG]], #2874, lsl #32 %0 = load i8*, i8** @pd2, align 8 %arrayidx = getelementptr inbounds i8, i8* %0, i64 12345678901234 %1 = load i8, i8* %arrayidx, align 1 diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index 071b2d0dbca4..a502800923fd 100644 --- a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -6216,11 +6216,11 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(i16* %bar, i16** %pt declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) ; CHECK-LABEL: test_ld1lane_build: -; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[0], [x0] -; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[1], [x1] -; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[0], [x2] -; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[1], [x3] -; CHECK: sub.2s v[[REGNUM2:[0-9]+]], [[REG0]], [[REG1]] +; CHECK-DAG: ldr s[[REGNUM0:[0-9]+]], [x0] +; CHECK-DAG: ld1.s { v[[REGNUM0:[0-9]+]] }[1], [x1] +; CHECK-DAG: ldr s[[REGNUM1:[0-9]+]], [x2] +; CHECK-DAG: ld1.s { v[[REGNUM1:[0-9]+]] }[1], [x3] +; CHECK: sub.2s v[[REGNUM2:[0-9]+]], v[[REGNUM0]], v[[REGNUM1]] ; CHECK-NEXT: str d[[REGNUM2]], [x4] ; CHECK-NEXT: ret define void @test_ld1lane_build(i32* %ptr0, i32* %ptr1, i32* %ptr2, i32* %ptr3, <2 x i32>* %out) { @@ -6238,3 +6238,84 @@ define void @test_ld1lane_build(i32* %ptr0, i32* %ptr1, i32* %ptr2, i32* %ptr3, store <2 x i32> %sub, <2 x i32>* %out, align 16 ret void } + +; CHECK-LABEL: test_ld1lane_build_i16: +; CHECK-DAG: ldr h[[REGNUM1:[0-9]+]], [x0] +; CHECK-DAG: ld1.h { v[[REGNUM1]] }[1], [x1] +; CHECK-DAG: ld1.h { v[[REGNUM1]] }[2], [x2] +; CHECK-DAG: ld1.h { v[[REGNUM1]] }[3], [x3] +; CHECK: sub.4h v[[REGNUM2:[0-9]+]], v[[REGNUM1]], v0 +; CHECK-NEXT: str d[[REGNUM2]], [x4] +; CHECK-NEXT: ret +define void @test_ld1lane_build_i16(i16* %a, i16* %b, i16* %c, i16* %d, <4 x i16> %e, <4 x i16>* %p) { + %ld.a = load i16, i16* %a + %ld.b = load i16, i16* %b + %ld.c = load i16, i16* %c + %ld.d = load i16, i16* %d + %v.a = insertelement <4 x i16> undef, i16 %ld.a, i64 
0 + %v.b = insertelement <4 x i16> %v.a, i16 %ld.b, i64 1 + %v.c = insertelement <4 x i16> %v.b, i16 %ld.c, i64 2 + %v = insertelement <4 x i16> %v.c, i16 %ld.d, i64 3 + %sub = sub nsw <4 x i16> %v, %e + store <4 x i16> %sub, <4 x i16>* %p + ret void +} + +; CHECK-LABEL: test_ld1lane_build_half: +; CHECK-DAG: ldr h[[REGNUM1:[0-9]+]], [x0] +; CHECK-DAG: ld1.h { v[[REGNUM1]] }[1], [x1] +; CHECK-DAG: ld1.h { v[[REGNUM1]] }[2], [x2] +; CHECK-DAG: ld1.h { v[[REGNUM1]] }[3], [x3] +; CHECK-DAG: fcvtl v[[REGNUM01:[0-9]+]].4s, v0.4h +; CHECK-DAG: fcvtl v[[REGNUM11:[0-9]+]].4s, v[[REGNUM1]].4h +; CHECK: fsub.4s v[[REGNUM2:[0-9]+]], v[[REGNUM11]], v[[REGNUM01]] +; CHECK-DAG: fcvtn v[[REGNUM3:[0-9]+]].4h, v[[REGNUM2]].4s +; CHECK-NEXT: str d[[REGNUM2]], [x4] +; CHECK-NEXT: ret +define void @test_ld1lane_build_half(half* %a, half* %b, half* %c, half* %d, <4 x half> %e, <4 x half>* %p) { + %ld.a = load half, half* %a + %ld.b = load half, half* %b + %ld.c = load half, half* %c + %ld.d = load half, half* %d + %v.a = insertelement <4 x half> undef, half %ld.a, i64 0 + %v.b = insertelement <4 x half> %v.a, half %ld.b, i64 1 + %v.c = insertelement <4 x half> %v.b, half %ld.c, i64 2 + %v = insertelement <4 x half> %v.c, half %ld.d, i64 3 + %sub = fsub <4 x half> %v, %e + store <4 x half> %sub, <4 x half>* %p + ret void +} + +; CHECK-LABEL: test_ld1lane_build_i8: +; CHECK-DAG: ldr b[[REGNUM1:[0-9]+]], [x0] +; CHECK-DAG: ld1.b { v[[REGNUM1]] }[1], [x1] +; CHECK-DAG: ld1.b { v[[REGNUM1]] }[2], [x2] +; CHECK-DAG: ld1.b { v[[REGNUM1]] }[3], [x3] +; CHECK-DAG: ld1.b { v[[REGNUM1]] }[4], [x4] +; CHECK-DAG: ld1.b { v[[REGNUM1]] }[5], [x5] +; CHECK-DAG: ld1.b { v[[REGNUM1]] }[6], [x6] +; CHECK-DAG: ld1.b { v[[REGNUM1]] }[7], [x7] +; CHECK: sub.8b v[[REGNUM2:[0-9]+]], v[[REGNUM1]], v0 +; CHECK-NEXT: str d[[REGNUM2]], [x +; CHECK-NEXT: ret +define void @test_ld1lane_build_i8(i8* %a, i8* %b, i8* %c, i8* %d, i8* %e, i8* %f, i8* %g, i8* %h, <8 x i8> %v, <8 x i8>* %p) { + %ld.a = load i8, i8* %a + %ld.b = load i8, i8* %b + %ld.c = load i8, i8* %c + %ld.d = load i8, i8* %d + %ld.e = load i8, i8* %e + %ld.f = load i8, i8* %f + %ld.g = load i8, i8* %g + %ld.h = load i8, i8* %h + %v.a = insertelement <8 x i8> undef, i8 %ld.a, i64 0 + %v.b = insertelement <8 x i8> %v.a, i8 %ld.b, i64 1 + %v.c = insertelement <8 x i8> %v.b, i8 %ld.c, i64 2 + %v.d = insertelement <8 x i8> %v.c, i8 %ld.d, i64 3 + %v.e = insertelement <8 x i8> %v.d, i8 %ld.e, i64 4 + %v.f = insertelement <8 x i8> %v.e, i8 %ld.f, i64 5 + %v.g = insertelement <8 x i8> %v.f, i8 %ld.g, i64 6 + %v1 = insertelement <8 x i8> %v.g, i8 %ld.h, i64 7 + %sub = sub nsw <8 x i8> %v1, %v + store <8 x i8> %sub, <8 x i8>* %p + ret void +} diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll index f3f359380440..f28d0ab07c5a 100644 --- a/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -236,14 +236,14 @@ define void @test_zero_reg(i32* %addr) { define <2 x float> @test_vreg_64bit(<2 x float> %in) nounwind { ; CHECK-LABEL: test_vreg_64bit: %1 = tail call <2 x float> asm sideeffect "fadd ${0}.2s, ${1}.2s, ${1}.2s", "={v14},w"(<2 x float> %in) nounwind - ; CHECK fadd v14.2s, v0.2s, v0.2s: + ; CHECK: fadd v14.2s, v0.2s, v0.2s ret <2 x float> %1 } define <4 x float> @test_vreg_128bit(<4 x float> %in) nounwind { ; CHECK-LABEL: test_vreg_128bit: %1 = tail call <4 x float> asm sideeffect "fadd ${0}.4s, ${1}.4s, ${1}.4s", "={v14},w"(<4 x float> %in) nounwind - ; CHECK fadd v14.4s, v0.4s, v0.4s: + ; CHECK: 
fadd v14.4s, v0.4s, v0.4s ret <4 x float> %1 } diff --git a/test/CodeGen/AArch64/arm64-memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll index 8f22f97ca087..384aaa8541df 100644 --- a/test/CodeGen/AArch64/arm64-memset-inline.ll +++ b/test/CodeGen/AArch64/arm64-memset-inline.ll @@ -13,8 +13,8 @@ define void @t2() nounwind ssp { entry: ; CHECK-LABEL: t2: ; CHECK: strh wzr, [sp, #32] -; CHECK: stp xzr, xzr, [sp, #16] -; CHECK: str xzr, [sp, #8] +; CHECK: stp xzr, xzr, [sp, #8] +; CHECK: str xzr, [sp, #24] %buf = alloca [26 x i8], align 1 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false) diff --git a/test/CodeGen/AArch64/arm64-movi.ll b/test/CodeGen/AArch64/arm64-movi.ll index c24490665d62..8d6caa81d978 100644 --- a/test/CodeGen/AArch64/arm64-movi.ll +++ b/test/CodeGen/AArch64/arm64-movi.ll @@ -51,24 +51,24 @@ define i32 @movz() nounwind { define i64 @movz_3movk() nounwind { ; CHECK-LABEL: movz_3movk: -; CHECK: mov x0, #1407374883553280 -; CHECK-NEXT: movk x0, #4660, lsl #32 +; CHECK: mov x0, #22136 ; CHECK-NEXT: movk x0, #43981, lsl #16 -; CHECK-NEXT: movk x0, #22136 +; CHECK-NEXT: movk x0, #4660, lsl #32 +; CHECK-NEXT: movk x0, #5, lsl #48 ret i64 1427392313513592 } define i64 @movz_movk_skip1() nounwind { ; CHECK-LABEL: movz_movk_skip1: -; CHECK: mov x0, #21474836480 -; CHECK-NEXT: movk x0, #17185, lsl #16 +; CHECK: mov x0, #1126236160 +; CHECK-NEXT: movk x0, #5, lsl #32 ret i64 22601072640 } define i64 @movz_skip1_movk() nounwind { ; CHECK-LABEL: movz_skip1_movk: -; CHECK: mov x0, #147695335374848 -; CHECK-NEXT: movk x0, #4660 +; CHECK: mov x0, #4660 +; CHECK-NEXT: movk x0, #34388, lsl #32 ret i64 147695335379508 } @@ -84,8 +84,8 @@ define i64 @movn() nounwind { define i64 @movn_skip1_movk() nounwind { ; CHECK-LABEL: movn_skip1_movk: -; CHECK: mov x0, #-176093659137 -; CHECK-NEXT: movk x0, #4660 +; CHECK: mov x0, #-60876 +; CHECK-NEXT: movk x0, #65494, lsl #32 ret i64 -176093720012 } @@ -195,8 +195,8 @@ define i64 @orr_movk13() nounwind { ; rdar://13944082 define i64 @g() nounwind { ; CHECK-LABEL: g: -; CHECK: mov x0, #-281474976710656 -; CHECK: movk x0, #2 +; CHECK: mov x0, #2 +; CHECK: movk x0, #65535, lsl #48 entry: ret i64 -281474976710654 } diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll index 8d9a8c06aa3c..a7b95e717910 100644 --- a/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -906,7 +906,7 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; CHECK: str q0 ; CHECK-DAG: and [[MASKED_IDX:x[0-9]+]], x0, #0x7 ; CHECK: bfi [[PTR:x[0-9]+]], [[MASKED_IDX]], #1, #3 -; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], {{\[}}[[PTR]]{{\]}} +; CHECK-DAG: ldr h[[R:[0-9]+]], {{\[}}[[PTR]]{{\]}} ; CHECK-DAG: ins v[[R]].h[1], v0.h[1] ; CHECK-DAG: ins v[[R]].h[2], v0.h[2] ; CHECK-DAG: ins v[[R]].h[3], v0.h[3] diff --git a/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll index 45dba479ccc4..ae087ab8cf05 100644 --- a/test/CodeGen/AArch64/arm64-neon-v8.1a.ll +++ b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s 
--check-prefix=CHECK-V81a ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple diff --git a/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll b/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll new file mode 100644 index 000000000000..7efb4bf6d596 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll @@ -0,0 +1,63 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -pass-remarks-analysis=asm-printer \ +; RUN: -pass-remarks-with-hotness=1 -asm-verbose=0 \ +; RUN: -debug-only=lazy-machine-block-freq,block-freq \ +; RUN: -debug-pass=Executions 2>&1 | FileCheck %s -check-prefix=HOTNESS + +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -pass-remarks-analysis=asm-printer \ +; RUN: -pass-remarks-with-hotness=0 -asm-verbose=0 \ +; RUN: -debug-only=lazy-machine-block-freq,block-freq \ +; RUN: -debug-pass=Executions 2>&1 | FileCheck %s -check-prefix=NO_HOTNESS + +; REQUIRES: asserts + + +; Verify that we don't new populate MachineBFI for passes that already use +; MBFI, e.g. GreedyRegAlloc. (This hard-codes the previous pass to the +; GreedyRegAlloc, please adjust accordingly.) + +; HOTNESS: Executing Pass 'Spill Code Placement Analysis' +; HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis' +; HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter' +; HOTNESS-NEXT: MachineBlockFrequencyInfo is available +; HOTNESS-NEXT: Executing Pass 'Greedy Register Allocator' + + +; Verify that we only populate MachineBFI on behalf of ORE when hotness is +; requested. (This hard-codes the previous pass to the Assembly Printer, +; please adjust accordingly.) + +; HOTNESS: Executing Pass 'Implement the 'patchable-function' attribute' +; HOTNESS-NEXT: Freeing Pass 'Implement the 'patchable-function' attribute' +; HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis' +; HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter' +; HOTNESS-NEXT: Building MachineBlockFrequencyInfo on the fly +; HOTNESS-NEXT: Building LoopInfo on the fly +; HOTNESS-NEXT: Building DominatorTree on the fly +; HOTNESS-NOT: Executing Pass +; HOTNESS: block-frequency: empty_func +; HOTNESS-NOT: Executing Pass +; HOTNESS: Executing Pass 'AArch64 Assembly Printer' + +; HOTNESS: arm64-summary-remarks.ll:5:0: 1 instructions in function (hotness: 33) + + +; NO_HOTNESS: Executing Pass 'Implement the 'patchable-function' attribute' +; NO_HOTNESS-NEXT: Freeing Pass 'Implement the 'patchable-function' attribute' +; NO_HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis' +; NO_HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter' +; NO_HOTNESS-NEXT: Executing Pass 'AArch64 Assembly Printer' + +; NO_HOTNESS: arm64-summary-remarks.ll:5:0: 1 instructions in function{{$}} + +define void @empty_func() nounwind ssp !dbg !3 !prof !4 { + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1) +!1 = !DIFile(filename: "arm64-summary-remarks.ll", directory: "") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "empty_func", scope: !1, file: !1, line: 5, scopeLine: 5, unit: !0) +!4 = !{!"function_entry_count", i64 33} diff --git a/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir b/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir index bda025af5193..9ad47c721c3a 100644 --- a/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir +++ b/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir @@ -1,4 +1,4 
@@ -# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s | FileCheck %s # CHECK: %1 = ANDWri {{.*}} # CHECK-NEXT: %wzr = SUBSWri {{.*}} --- | diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll index 255cd8e4a0d3..4df220eddbbb 100644 --- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -346,19 +346,15 @@ entry: ; CHECK-NEXT: sub w1, w1, #1 ; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]] ; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]] -; DISABLE-NEXT: b [[IFEND_LABEL]] -; -; DISABLE: [[ELSE_LABEL]]: ; %if.else -; DISABLE: lsl w0, w1, #1 -; -; CHECK: [[IFEND_LABEL]]: +; CHECK-NEXT: [[IFEND_LABEL]]: ; Epilogue code. ; CHECK: add sp, sp, #16 ; CHECK-NEXT: ret ; -; ENABLE: [[ELSE_LABEL]]: ; %if.else -; ENABLE-NEXT: lsl w0, w1, #1 -; ENABLE_NEXT: ret +; CHECK: [[ELSE_LABEL]]: ; %if.else +; CHECK-NEXT: lsl w0, w1, #1 +; DISABLE-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 { entry: %ap = alloca i8*, align 8 diff --git a/test/CodeGen/AArch64/arm64-spill-remarks.ll b/test/CodeGen/AArch64/arm64-spill-remarks.ll new file mode 100644 index 000000000000..bc9340352d75 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-spill-remarks.ll @@ -0,0 +1,117 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -aarch64-neon-syntax=apple -pass-remarks-missed=regalloc 2>&1 | FileCheck -check-prefix=REMARK %s +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -aarch64-neon-syntax=apple -pass-remarks-missed=regalloc -pass-remarks-with-hotness 2>&1 | FileCheck -check-prefix=HOTNESS %s +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -aarch64-neon-syntax=apple 2>&1 | FileCheck -check-prefix=NO_REMARK %s +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -aarch64-neon-syntax=apple -pass-remarks-output=%t.yaml -pass-remarks-with-hotness 2>&1 | FileCheck -check-prefix=NO_REMARK %s +; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s + +; This has two nested loops, each with one value that has to be spilled and +; then reloaded. + +; (loop3:) +; REMARK: remark: /tmp/kk.c:3:20: 1 spills 1 reloads generated in loop{{$}} +; (loop2:) +; REMARK: remark: /tmp/kk.c:2:20: 1 spills 1 reloads generated in loop{{$}} +; (loop:) +; REMARK: remark: /tmp/kk.c:1:20: 2 spills 2 reloads generated in loop{{$}} + +; (loop3:) +; HOTNESS: remark: /tmp/kk.c:3:20: 1 spills 1 reloads generated in loop (hotness: 300) +; (loop2:) +; HOTNESS: remark: /tmp/kk.c:2:20: 1 spills 1 reloads generated in loop (hotness: 30000) +; (loop:) +; HOTNESS: remark: /tmp/kk.c:1:20: 2 spills 2 reloads generated in loop (hotness: 300) + +; NO_REMARK-NOT: remark + +; YAML: --- !Missed +; YAML: Pass: regalloc +; YAML: Name: LoopSpillReload +; YAML: DebugLoc: { File: /tmp/kk.c, Line: 3, Column: 20 } +; YAML: Function: fpr128 +; YAML: Hotness: 300 +; YAML: Args: +; YAML: - NumSpills: '1' +; YAML: - String: ' spills ' +; YAML: - NumReloads: '1' +; YAML: - String: ' reloads ' +; YAML: - String: generated in loop +; YAML: ... +; YAML: --- !Missed +; YAML: Pass: regalloc +; YAML: Name: LoopSpillReload +; YAML: DebugLoc: { File: /tmp/kk.c, Line: 2, Column: 20 } +; YAML: Function: fpr128 +; YAML: Hotness: 30000 +; YAML: Args: +; YAML: - NumSpills: '1' +; YAML: - String: ' spills ' +; YAML: - NumReloads: '1' +; YAML: - String: ' reloads ' +; YAML: - String: generated in loop +; YAML: ... 
+; YAML: --- !Missed +; YAML: Pass: regalloc +; YAML: Name: LoopSpillReload +; YAML: DebugLoc: { File: /tmp/kk.c, Line: 1, Column: 20 } +; YAML: Function: fpr128 +; YAML: Hotness: 300 +; YAML: Args: +; YAML: - NumSpills: '2' +; YAML: - String: ' spills ' +; YAML: - NumReloads: '2' +; YAML: - String: ' reloads ' +; YAML: - String: generated in loop +; YAML: ... + +define void @fpr128(<4 x float>* %p) nounwind ssp !prof !11 { +entry: + br label %loop, !dbg !8 + +loop: + %i = phi i32 [ 0, %entry], [ %i.2, %end2 ] + br label %loop2, !dbg !9 + +loop2: + %j = phi i32 [ 0, %loop], [ %j.2, %loop2 ] + call void asm sideeffect "; inlineasm", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15},~{q16},~{q17},~{q18},~{q19},~{q20},~{q21},~{q22},~{q23},~{q24},~{q25},~{q26},~{q27},~{q28},~{q29},~{q30},~{q31},~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind + %j.2 = add i32 %j, 1 + %c2 = icmp slt i32 %j.2, 100 + br i1 %c2, label %loop2, label %end2, !prof !12 + +end2: + call void asm sideeffect "; inlineasm", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15},~{q16},~{q17},~{q18},~{q19},~{q20},~{q21},~{q22},~{q23},~{q24},~{q25},~{q26},~{q27},~{q28},~{q29},~{q30},~{q31},~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind + %i.2 = add i32 %i, 1 + %c = icmp slt i32 %i.2, 100 + br i1 %c, label %loop, label %end, !prof !12 + +end: + br label %loop3 + +loop3: + %k = phi i32 [ 0, %end], [ %k.2, %loop3 ] + call void asm sideeffect "; inlineasm", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15},~{q16},~{q17},~{q18},~{q19},~{q20},~{q21},~{q22},~{q23},~{q24},~{q25},~{q26},~{q27},~{q28},~{q29},~{q30},~{q31},~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind + %k.2 = add i32 %k, 1 + %c3 = icmp slt i32 %k.2, 100 + br i1 %c3, label %loop3, label %end3, !dbg !10, !prof !12 + +end3: + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2) +!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"PIC Level", i32 2} +!5 = !{!"clang version 3.9.0 "} +!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 1, column: 20, scope: !6) +!9 = !DILocation(line: 2, column: 20, scope: !6) +!10 = !DILocation(line: 3, column: 20, scope: !6) +!11 = !{!"function_entry_count", i64 3} +!12 = !{!"branch_weights", i32 99, i32 1} diff --git a/test/CodeGen/AArch64/arm64-summary-remarks.ll b/test/CodeGen/AArch64/arm64-summary-remarks.ll new file mode 100644 index 
000000000000..70e7fdffd63d --- /dev/null +++ b/test/CodeGen/AArch64/arm64-summary-remarks.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -pass-remarks-analysis=asm-printer 2>&1 | FileCheck %s + +; CHECK: arm64-summary-remarks.ll:5:0: 1 instructions in function + +define void @empty_func() nounwind ssp !dbg !3 { + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1) +!1 = !DIFile(filename: "arm64-summary-remarks.ll", directory: "") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "empty_func", scope: !1, file: !1, line: 5, scopeLine: 5, unit: !0) diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll index 16ddf690fe95..375877c51798 100644 --- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -99,7 +99,7 @@ define void @test_nospare([8 x i64], [8 x float], ...) { ; __stack field should point just past them. define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) { ; CHECK-LABEL: test_offsetstack: -; CHECK: sub sp, sp, #80 +; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]! ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96 ; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var ; CHECK: str [[STACK_TOP]], [x[[VAR]]] diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index 735be244d457..42b0051a2dd6 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -428,8 +428,8 @@ define i32 @test5(i32 %a) { ; BFXIL will use the same constant as the ORR, so we don't care how the constant ; is materialized (it's an equal cost either way). ; CHECK-LABEL: @test6 -; CHECK: mov [[REG:w[0-9]+]], #720896 -; CHECK: movk [[REG]], #23250 +; CHECK: mov [[REG:w[0-9]+]], #23250 +; CHECK: movk [[REG]], #11, lsl #16 ; CHECK: bfxil w0, [[REG]], #0, #20 define i32 @test6(i32 %a) { %1 = and i32 %a, 4293918720 ; 0xfff00000 @@ -440,8 +440,8 @@ define i32 @test6(i32 %a) { ; BFIs that require the same number of instruction to materialize the constant ; as the original ORR are okay. ; CHECK-LABEL: @test7 -; CHECK: mov [[REG:w[0-9]+]], #327680 -; CHECK: movk [[REG]], #44393 +; CHECK: mov [[REG:w[0-9]+]], #44393 +; CHECK: movk [[REG]], #5, lsl #16 ; CHECK: bfi w0, [[REG]], #1, #19 define i32 @test7(i32 %a) { %1 = and i32 %a, 4293918721 ; 0xfff00001 @@ -454,9 +454,9 @@ define i32 @test7(i32 %a) { ; 'and' with a 'movk', which would decrease ILP while using the same number of ; instructions. 
; CHECK-LABEL: @test8 -; CHECK: mov [[REG2:x[0-9]+]], #157599529959424 +; CHECK: mov [[REG2:x[0-9]+]], #2035482624 ; CHECK: and [[REG1:x[0-9]+]], x0, #0xff000000000000ff -; CHECK: movk [[REG2]], #31059, lsl #16 +; CHECK: movk [[REG2]], #36694, lsl #32 ; CHECK: orr x0, [[REG1]], [[REG2]] define i64 @test8(i64 %a) { %1 = and i64 %a, -72057594037927681 ; 0xff000000000000ff diff --git a/test/CodeGen/AArch64/blockaddress.ll b/test/CodeGen/AArch64/blockaddress.ll index 7c0755a13d0e..3683332c2c64 100644 --- a/test/CodeGen/AArch64/blockaddress.ll +++ b/test/CodeGen/AArch64/blockaddress.ll @@ -14,10 +14,10 @@ define void @test_blockaddress() { ; CHECK: ldr [[NEWDEST:x[0-9]+]] ; CHECK: br [[NEWDEST]] -; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]] -; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]] +; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g0_nc:[[DEST_LBL:.Ltmp[0-9]+]] ; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]] -; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g3:[[DEST_LBL]] ; CHECK-LARGE: str [[ADDR_REG]], ; CHECK-LARGE: ldr [[NEWDEST:x[0-9]+]] ; CHECK-LARGE: br [[NEWDEST]] diff --git a/test/CodeGen/AArch64/br-cond-not-merge.ll b/test/CodeGen/AArch64/br-cond-not-merge.ll new file mode 100644 index 000000000000..bf21ef307905 --- /dev/null +++ b/test/CodeGen/AArch64/br-cond-not-merge.ll @@ -0,0 +1,94 @@ +; RUN: llc -mtriple=aarch64 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s +; RUN: llc -mtriple=aarch64 -verify-machineinstrs -O0 -fast-isel=0 < %s | FileCheck --check-prefix=CHECK --check-prefix=NOOPT %s + +declare void @foo() + +; Check that the inverted or doesn't inhibit the splitting of the +; complex conditional into three branch instructions. +; CHECK-LABEL: test_and_not: +; CHECK: cbz w0, [[L:\.LBB[0-9_]+]] +; OPT: cmp w1, #2 +; NOOPT: subs w{{[0-9]+}}, w{{[0-9]+}}, #2 +; CHECK: b.lo [[L]] +; OPT: cmp w2, #2 +; NOOPT: subs w{{[0-9]+}}, w{{[0-9]+}}, #2 +; CHECK: b.hi [[L]] +define void @test_and_not(i32 %a, i32 %b, i32 %c) { +bb1: + %cmp1 = icmp ult i32 %a, 1 + %cmp2 = icmp ult i32 %b, 2 + %cmp3 = icmp ult i32 %c, 3 + %or = or i1 %cmp1, %cmp2 + %not.or = xor i1 %or, -1 + %and = and i1 %not.or, %cmp3 + br i1 %and, label %bb2, label %bb3 + +bb2: + ret void + +bb3: + call void @foo() + ret void +} + +; Check that non-canonicalized xor not is handled correctly by FindMergedConditions. +; CHECK-LABEL: test_and_not2: +; CHECK: cbz w0, [[L:\.LBB[0-9_]+]] +; OPT: cmp w1, #2 +; NOOPT: subs w{{[0-9]+}}, w{{[0-9]+}}, #2 +; CHECK: b.lo [[L]] +; OPT: cmp w2, #2 +; NOOPT: subs w{{[0-9]+}}, w{{[0-9]+}}, #2 +; CHECK: b.hi [[L]] +define void @test_and_not2(i32 %a, i32 %b, i32 %c) { +bb1: + %cmp1 = icmp ult i32 %a, 1 + %cmp2 = icmp ult i32 %b, 2 + %cmp3 = icmp ult i32 %c, 3 + %or = or i1 %cmp1, %cmp2 + %not.or = xor i1 -1, %or + %and = and i1 %not.or, %cmp3 + br i1 %and, label %bb2, label %bb3 + +bb2: + ret void + +bb3: + call void @foo() + ret void +} + +; Check that cmps in different blocks are handled correctly by FindMergedConditions. 
+; CHECK-LABEL: test_cmp_other_block: +; OPT: cmp w{{[0-9]+}}, #0 +; OPT: b.gt [[L:\.LBB[0-9_]+]] +; OPT: tbz w1, #0, [[L]] +; +; NOOPT: subs w{{[0-9]+}}, w{{[0-9]+}}, #0 +; NOOPT: cset [[R1:w[0-9]+]], gt +; NOOPT: str w1, [sp, #[[SLOT2:[0-9]+]]] +; NOOPT: str [[R1]], [sp, #[[SLOT1:[0-9]+]]] +; NOOPT: b .LBB +; NOOPT: ldr [[R2:w[0-9]+]], [sp, #[[SLOT1]]] +; NOOPT: tbnz [[R2]], #0, [[L:\.LBB[0-9_]+]] +; NOOPT: ldr [[R3:w[0-9]+]], [sp, #[[SLOT2]]] +; NOOPT: tbz [[R3]], #0, [[L]] +define void @test_cmp_other_block(i32* %p, i1 %c) { +entry: + %l = load i32, i32* %p + %cmp = icmp sgt i32 %l, 0 + br label %bb1 + +bb1: + %cmp.i = xor i1 %cmp, true + %or.cond1.i = and i1 %cmp.i, %c + br i1 %or.cond1.i, label %bb2, label %bb3 + +bb2: + ret void + +bb3: + call void @foo() + ret void +} + diff --git a/test/CodeGen/AArch64/branch-relax-cbz.ll b/test/CodeGen/AArch64/branch-relax-cbz.ll index c654b94e49cf..d13c0f677bcb 100644 --- a/test/CodeGen/AArch64/branch-relax-cbz.ll +++ b/test/CodeGen/AArch64/branch-relax-cbz.ll @@ -6,23 +6,22 @@ ; CHECK-NEXT: ; BB#1: ; %b3 ; CHECK: ldr [[LOAD:w[0-9]+]] -; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]] - -; CHECK-NEXT: [[SKIP_LONG_B]]: +; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: [[B8]]: ; %b8 +; CHECK-NEXT: ret + ; CHECK-NEXT: [[B2]]: ; %b2 ; CHECK: mov w{{[0-9]+}}, #93 ; CHECK: bl _extfunc ; CHECK: cbz w{{[0-9]+}}, [[B7]] - -; CHECK-NEXT: [[B8]]: ; %b8 -; CHECK-NEXT: ret +; CHECK-NEXT: b [[B8]] ; CHECK-NEXT: [[B7]]: ; %b7 ; CHECK: mov w{{[0-9]+}}, #13 ; CHECK: b _extfunc + define void @split_block_no_fallthrough(i64 %val) #0 { bb: %c0 = icmp sgt i64 %val, -5 diff --git a/test/CodeGen/AArch64/code-model-large-abs.ll b/test/CodeGen/AArch64/code-model-large-abs.ll index 1680815d93ea..82169acc3e56 100644 --- a/test/CodeGen/AArch64/code-model-large-abs.ll +++ b/test/CodeGen/AArch64/code-model-large-abs.ll @@ -9,10 +9,10 @@ define i8* @global_addr() { ; CHECK-LABEL: global_addr: ret i8* @var8 ; The movz/movk calculation should end up returned directly in x0. 
-; CHECK: movz x0, #:abs_g3:var8 -; CHECK: movk x0, #:abs_g2_nc:var8 +; CHECK: movz x0, #:abs_g0_nc:var8 ; CHECK: movk x0, #:abs_g1_nc:var8 -; CHECK: movk x0, #:abs_g0_nc:var8 +; CHECK: movk x0, #:abs_g2_nc:var8 +; CHECK: movk x0, #:abs_g3:var8 ; CHECK-NEXT: ret } @@ -20,10 +20,10 @@ define i8 @global_i8() { ; CHECK-LABEL: global_i8: %val = load i8, i8* @var8 ret i8 %val -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8 -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8 +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:var8 ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8 -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g3:var8 ; CHECK: ldrb w0, [x[[ADDR_REG]]] } @@ -31,10 +31,10 @@ define i16 @global_i16() { ; CHECK-LABEL: global_i16: %val = load i16, i16* @var16 ret i16 %val -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16 -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16 +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:var16 ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16 -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g3:var16 ; CHECK: ldrh w0, [x[[ADDR_REG]]] } @@ -42,10 +42,10 @@ define i32 @global_i32() { ; CHECK-LABEL: global_i32: %val = load i32, i32* @var32 ret i32 %val -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32 -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32 +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:var32 ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32 -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g3:var32 ; CHECK: ldr w0, [x[[ADDR_REG]]] } @@ -53,9 +53,9 @@ define i64 @global_i64() { ; CHECK-LABEL: global_i64: %val = load i64, i64* @var64 ret i64 %val -; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64 -; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64 +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g0_nc:var64 ; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64 -; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g3:var64 ; CHECK: ldr x0, [x[[ADDR_REG]]] } diff --git a/test/CodeGen/AArch64/concat_vector-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll index 1c64af636cb3..3abb14241ea0 100644 --- a/test/CodeGen/AArch64/concat_vector-scalar-combine.ll +++ b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll @@ -38,7 +38,7 @@ entry: define <8 x i8> @test_concat_scalars_2x_v2i8_to_v8i8(i32 %x, i32 %y) #0 { entry: ; CHECK-LABEL: test_concat_scalars_2x_v2i8_to_v8i8: -; CHECK-NEXT: ins.h v0[0], w0 +; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ins.h v0[1], w1 ; CHECK-NEXT: ins.h v0[3], w1 ; CHECK-NEXT: ret @@ -84,7 +84,7 @@ define <8 x i8> @test_concat_scalars_mixed_2x_v2i8_to_v8i8(float %dummy, i32 %x, entry: ; CHECK-LABEL: test_concat_scalars_mixed_2x_v2i8_to_v8i8: ; CHECK-NEXT: fmov s[[X:[0-9]+]], w0 -; CHECK-NEXT: ins.h v0[0], v[[X]][0] +; CHECK-NEXT: mov.16b v0, v[[X]] ; CHECK-NEXT: ins.h v0[1], v1[0] ; CHECK-NEXT: ins.h v0[2], v[[X]][0] ; CHECK-NEXT: ins.h v0[3], v1[0] @@ -99,7 +99,7 @@ entry: define <2 x float> @test_concat_scalars_fp_2x_v2i8_to_v8i8(float %dummy, half %x, half %y) #0 { entry: ; CHECK-LABEL: test_concat_scalars_fp_2x_v2i8_to_v8i8: -; CHECK-NEXT: ins.h v0[0], v1[0] +; CHECK-NEXT: mov.16b v0, v1 ; CHECK-NEXT: ins.h v0[1], v2[0] ; CHECK-NEXT: ins.h v0[2], v1[0] ; CHECK-NEXT: ins.h v0[3], v2[0] diff --git 
a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll index 50685cf5d343..f65144def245 100644 --- a/test/CodeGen/AArch64/cpus.ll +++ b/test/CodeGen/AArch64/cpus.ll @@ -12,7 +12,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=exynos-m3 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=falkor 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=kryo 2>&1 | FileCheck %s -; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=vulcan 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=thunderx2t99 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; CHECK-NOT: {{.*}} is not a recognized processor for this target diff --git a/test/CodeGen/AArch64/dag-numsignbits.ll b/test/CodeGen/AArch64/dag-numsignbits.ll new file mode 100644 index 000000000000..217c3df77c9c --- /dev/null +++ b/test/CodeGen/AArch64/dag-numsignbits.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -mtriple=aarch64-unknown | FileCheck %s + +; PR32273 + +define void @signbits_vXi1(<4 x i16> %a1) { +; CHECK-LABEL: signbits_vXi1 +; CHECK: cmgt v0.4h, v1.4h, v0.4h +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: umov w3, v0.h[3] +; CHECK-NEXT: mov w1, wzr +; CHECK-NEXT: mov w2, wzr +; CHECK-NEXT: b foo + %tmp3 = shufflevector <4 x i16> %a1, <4 x i16> undef, <4 x i32> zeroinitializer + %tmp5 = add <4 x i16> %tmp3, + %tmp6 = icmp slt <4 x i16> %tmp5, + %tmp7 = and <4 x i1> %tmp6, + %tmp8 = sext <4 x i1> %tmp7 to <4 x i16> + %tmp9 = extractelement <4 x i16> %tmp8, i32 0 + %tmp10 = zext i16 %tmp9 to i32 + %tmp11 = extractelement <4 x i16> %tmp8, i32 1 + %tmp12 = zext i16 %tmp11 to i32 + %tmp13 = extractelement <4 x i16> %tmp8, i32 2 + %tmp14 = zext i16 %tmp13 to i32 + %tmp15 = extractelement <4 x i16> %tmp8, i32 3 + %tmp16 = zext i16 %tmp15 to i32 + tail call void @foo(i32 %tmp10, i32 %tmp12, i32 %tmp14, i32 %tmp16) + ret void +} + +declare void @foo(i32, i32, i32, i32) diff --git a/test/CodeGen/AArch64/eliminate-trunc.ll b/test/CodeGen/AArch64/eliminate-trunc.ll index bc4ac7d71704..83730d15d7f5 100644 --- a/test/CodeGen/AArch64/eliminate-trunc.ll +++ b/test/CodeGen/AArch64/eliminate-trunc.ll @@ -6,7 +6,7 @@ ; CHECK-NOT: add {{x[0-9]+}}, {{x[0-9]+}}, #1 ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1 ; CHECK-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}} -define void @test1_signed([8 x i8]* nocapture %a, i8* nocapture readonly %box, i8 %limit) minsize { +define void @test1_signed([8 x i8]* nocapture %a, i8* nocapture readonly %box, i8 %limit, i64 %inv) minsize { entry: %conv = zext i8 %limit to i32 %cmp223 = icmp eq i8 %limit, 0 @@ -14,7 +14,7 @@ entry: for.body4.us: %indvars.iv = phi i64 [ 0, %for.body4.lr.ph.us ], [ %indvars.iv.next, %for.body4.us ] - %arrayidx6.us = getelementptr inbounds [8 x i8], [8 x i8]* %a, i64 %indvars.iv26, i64 %indvars.iv + %arrayidx6.us = getelementptr inbounds [8 x i8], [8 x i8]* %a, i64 %indvars.iv, i64 %inv %0 = load i8, i8* %arrayidx6.us, align 1 %idxprom7.us = zext i8 %0 to i64 %arrayidx8.us = getelementptr inbounds i8, i8* %box, i64 %idxprom7.us diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll index 921009cf821d..ac2153ad8ffe 100644 --- a/test/CodeGen/AArch64/extern-weak.ll +++ b/test/CodeGen/AArch64/extern-weak.ll @@ -16,10 +16,10 @@ define i32()* @foo() { ; In the large model, the usual relocations are absolute and can ; 
materialise 0. -; CHECK-LARGE: movz x0, #:abs_g3:var -; CHECK-LARGE: movk x0, #:abs_g2_nc:var +; CHECK-LARGE: movz x0, #:abs_g0_nc:var ; CHECK-LARGE: movk x0, #:abs_g1_nc:var -; CHECK-LARGE: movk x0, #:abs_g0_nc:var +; CHECK-LARGE: movk x0, #:abs_g2_nc:var +; CHECK-LARGE: movk x0, #:abs_g3:var } @@ -37,10 +37,10 @@ define i32* @bar() { ; In the large model, the usual relocations are absolute and can ; materialise 0. -; CHECK-LARGE: movz [[ADDR:x[0-9]+]], #:abs_g3:arr_var -; CHECK-LARGE: movk [[ADDR]], #:abs_g2_nc:arr_var +; CHECK-LARGE: movz [[ADDR:x[0-9]+]], #:abs_g0_nc:arr_var ; CHECK-LARGE: movk [[ADDR]], #:abs_g1_nc:arr_var -; CHECK-LARGE: movk [[ADDR]], #:abs_g0_nc:arr_var +; CHECK-LARGE: movk [[ADDR]], #:abs_g2_nc:arr_var +; CHECK-LARGE: movk [[ADDR]], #:abs_g3:arr_var } @defined_weak_var = internal unnamed_addr global i32 0 @@ -51,8 +51,8 @@ define i32* @wibble() { ; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var ; CHECK: add x0, [[BASE]], :lo12:defined_weak_var -; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var -; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var +; CHECK-LARGE: movz x0, #:abs_g0_nc:defined_weak_var ; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var -; CHECK-LARGE: movk x0, #:abs_g0_nc:defined_weak_var +; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var +; CHECK-LARGE: movk x0, #:abs_g3:defined_weak_var } diff --git a/test/CodeGen/AArch64/fast-isel-tail-call.ll b/test/CodeGen/AArch64/fast-isel-tail-call.ll new file mode 100644 index 000000000000..0efaa3734486 --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-tail-call.ll @@ -0,0 +1,24 @@ +; RUN: llc -fast-isel -pass-remarks-missed=isel -pass-remarks-missed=isel \ +; RUN: -mtriple arm64-- < %s 2> %t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix MISSED + +%struct = type { [4 x i32] } + +declare %struct @external() + +; Check that, when fastisel falls back to SDAG, we don't emit instructions +; that follow a tail-call and would have been dropped by pure SDAGISel. + +; Here, the %struct extractvalue should fail FastISel. + +; MISSED: FastISel missed: %tmp1 = extractvalue %struct %tmp0, 0 + +; CHECK-LABEL: test: +; CHECK: b external +; CHECK-NEXT: .Lfunc_end0: +define i32 @test() nounwind { + %tmp0 = tail call %struct @external() + %tmp1 = extractvalue %struct %tmp0, 0 + %tmp2 = extractvalue [4 x i32] %tmp1, 0 + ret i32 %tmp2 +} diff --git a/test/CodeGen/AArch64/fast-isel-tbz.ll b/test/CodeGen/AArch64/fast-isel-tbz.ll index af817777143d..d6d10318bf02 100644 --- a/test/CodeGen/AArch64/fast-isel-tbz.ll +++ b/test/CodeGen/AArch64/fast-isel-tbz.ll @@ -278,8 +278,24 @@ bb2: ; Test that we don't fold the 'and' instruction into the compare. define i32 @icmp_eq_and_i32(i32 %a, i1 %c) { ; CHECK-LABEL: icmp_eq_and_i32 -; CHECK: and [[REG:w[0-9]+]], w0, #0x4 +; CHECK: and [[REG:w[0-9]+]], w0, #0x3 ; CHECK-NEXT: cbz [[REG]], {{LBB.+_3}} + %1 = and i32 %a, 3 + br i1 %c, label %bb0, label %bb2 +bb0: + %2 = icmp eq i32 %1, 0 + br i1 %2, label %bb1, label %bb2, !prof !0 +bb1: + ret i32 1 +bb2: + ret i32 0 +} + +; Test that we do fold the 'and' instruction into the compare and +; generate a tbz instruction for the conditional branch. 
+define i32 @icmp_eq_and1bit_i32(i32 %a, i1 %c) { +; CHECK-LABEL: icmp_eq_and1bit_i32 +; CHECK: tbz {{w[0-9]+}}, #2, {{LBB.+_3}} %1 = and i32 %a, 4 br i1 %c, label %bb0, label %bb2 bb0: diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll index b4faef750a2c..d19777c4d27e 100644 --- a/test/CodeGen/AArch64/fpimm.ll +++ b/test/CodeGen/AArch64/fpimm.ll @@ -38,18 +38,18 @@ define void @check_double() { } ; LARGE-LABEL: check_float2 -; LARGE: mov [[REG:w[0-9]+]], #1078525952 -; LARGE-NEXT: movk [[REG]], #4059 +; LARGE: mov [[REG:w[0-9]+]], #4059 +; LARGE-NEXT: movk [[REG]], #16457, lsl #16 ; LARGE-NEXT: fmov s0, [[REG]] define float @check_float2() { ret float 3.14159274101257324218750 } ; LARGE-LABEL: check_double2 -; LARGE: mov [[REG:x[0-9]+]], #4614219293217783808 -; LARGE-NEXT: movk [[REG]], #8699, lsl #32 +; LARGE: mov [[REG:x[0-9]+]], #11544 ; LARGE-NEXT: movk [[REG]], #21572, lsl #16 -; LARGE-NEXT: movk [[REG]], #11544 +; LARGE-NEXT: movk [[REG]], #8699, lsl #32 +; LARGE-NEXT: movk [[REG]], #16393, lsl #48 ; LARGE-NEXT: fmov d0, [[REG]] define double @check_double2() { ret double 3.1415926535897931159979634685441851615905761718750 diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll index d6a7fceac84d..f71d4356be35 100644 --- a/test/CodeGen/AArch64/jump-table.ll +++ b/test/CodeGen/AArch64/jump-table.ll @@ -16,10 +16,10 @@ define i32 @test_jumptable(i32 %in) { ; CHECK: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3] ; CHECK: br [[DEST]] -; CHECK-LARGE: movz x[[JTADDR:[0-9]+]], #:abs_g3:.LJTI0_0 -; CHECK-LARGE: movk x[[JTADDR]], #:abs_g2_nc:.LJTI0_0 +; CHECK-LARGE: movz x[[JTADDR:[0-9]+]], #:abs_g0_nc:.LJTI0_0 ; CHECK-LARGE: movk x[[JTADDR]], #:abs_g1_nc:.LJTI0_0 -; CHECK-LARGE: movk x[[JTADDR]], #:abs_g0_nc:.LJTI0_0 +; CHECK-LARGE: movk x[[JTADDR]], #:abs_g2_nc:.LJTI0_0 +; CHECK-LARGE: movk x[[JTADDR]], #:abs_g3:.LJTI0_0 ; CHECK-LARGE: ldr [[DEST:x[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}, lsl #3] ; CHECK-LARGE: br [[DEST]] diff --git a/test/CodeGen/AArch64/large-consts.ll b/test/CodeGen/AArch64/large-consts.ll index 6bf85e829f61..e351c3530696 100644 --- a/test/CodeGen/AArch64/large-consts.ll +++ b/test/CodeGen/AArch64/large-consts.ll @@ -5,10 +5,10 @@ define double @foo() { -; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [0bAAA01000,A,0b111AAAAA,0xd2] -; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b110AAAAA,0xf2] +; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g0_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b100AAAAA,0xd2] ; CHECK: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b101AAAAA,0xf2] -; CHECK: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b100AAAAA,0xf2] +; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b110AAAAA,0xf2] +; CHECK: movk [[CPADDR]], #:abs_g3:.LCPI0_0 // encoding: [0bAAA01000,A,0b111AAAAA,0xf2] ret double 3.14159 } diff --git a/test/CodeGen/AArch64/ldst-opt-aa.mir b/test/CodeGen/AArch64/ldst-opt-aa.mir new file mode 100644 index 000000000000..808926ae3cd1 --- /dev/null +++ b/test/CodeGen/AArch64/ldst-opt-aa.mir @@ -0,0 +1,30 @@ +# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-ldst-opt %s -verify-machineinstrs -o - | FileCheck %s +--- | + define void @ldr_str_aa(i32* noalias nocapture %x, i32* noalias nocapture readonly %y) { + entry: + %0 = load i32, i32* %y, align 4 + store i32 %0, i32* %x, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %y, i32 1 + %1 = load i32, i32* %arrayidx2, align 4 + 
%arrayidx3 = getelementptr inbounds i32, i32* %x, i32 1 + store i32 %1, i32* %arrayidx3, align 4 + ret void + } + +... +--- +# CHECK-LABEL: name: ldr_str_aa +# CHECK: %w8, %w9 = LDPWi %x1, 0 +# CHECK: STPWi %w8, %w9, %x0, 0 +name: ldr_str_aa +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: %x0, %x1 + + %w8 = LDRWui %x1, 0 :: (load 4 from %ir.y) + STRWui killed %w8, %x0, 0 :: (store 4 into %ir.x) + %w9 = LDRWui killed %x1, 1 :: (load 4 from %ir.arrayidx2) + STRWui killed %w9, killed %x0, 1 :: (store 4 into %ir.arrayidx3) + RET undef %lr + diff --git a/test/CodeGen/AArch64/ldst-opt.mir b/test/CodeGen/AArch64/ldst-opt.mir index 85b655b717ca..f7641d3ffd04 100644 --- a/test/CodeGen/AArch64/ldst-opt.mir +++ b/test/CodeGen/AArch64/ldst-opt.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-ldst-opt %s -verify-machineinstrs -o - 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-ldst-opt %s -verify-machineinstrs -o - | FileCheck %s --- name: promote-load-from-store tracksRegLiveness: true diff --git a/test/CodeGen/AArch64/literal_pools_float.ll b/test/CodeGen/AArch64/literal_pools_float.ll index f5d6a17f3a11..6ad685ad7c49 100644 --- a/test/CodeGen/AArch64/literal_pools_float.ll +++ b/test/CodeGen/AArch64/literal_pools_float.ll @@ -15,10 +15,10 @@ define void @floating_lits() { ; CHECK: ldr [[LIT128:s[0-9]+]], [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]] ; CHECK-NOFP-NOT: ldr {{s[0-9]+}}, -; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI[0-9]+_[0-9]+]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] +; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g0_nc:[[CURLIT:.LCPI[0-9]+_[0-9]+]] ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]] +; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] +; CHECK-LARGE: movk x[[LITADDR]], #:abs_g3:[[CURLIT]] ; CHECK-LARGE: ldr {{s[0-9]+}}, [x[[LITADDR]]] ; CHECK-LARGE: fadd ; CHECK-NOFP-LARGE-NOT: ldr {{s[0-9]+}}, @@ -33,10 +33,10 @@ define void @floating_lits() { ; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, ; CHECK-NOFP-NOT: fadd -; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI[0-9]+_[0-9]+]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] +; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g0_nc:[[CURLIT:.LCPI[0-9]+_[0-9]+]] ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]] +; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] +; CHECK-LARGE: movk x[[LITADDR]], #:abs_g3:[[CURLIT]] ; CHECK-LARGE: ldr {{d[0-9]+}}, [x[[LITADDR]]] ; CHECK-NOFP-LARGE-NOT: ldr {{d[0-9]+}}, diff --git a/test/CodeGen/AArch64/live-interval-analysis.mir b/test/CodeGen/AArch64/live-interval-analysis.mir new file mode 100644 index 000000000000..d44300973566 --- /dev/null +++ b/test/CodeGen/AArch64/live-interval-analysis.mir @@ -0,0 +1,22 @@ +# RUN: llc -o /dev/null %s -mtriple=aarch64-darwin-ios -run-pass=liveintervals -debug-only=regalloc -precompute-phys-liveness 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @reserved_reg_liveness() { ret void } +... +--- +# CHECK-LABEL: ********** INTERVALS ********** +# W29 is reserved, so we should only see dead defs +# CHECK-DAG: W29 [0B,0d:{{[0-9]+}})[32r,32d:{{[0-9]+}})[64r,64d:{{[0-9]+}}) +# For normal registers like x28 we should see the full intervals +# CHECK-DAG: W28 [0B,16r:{{[0-9]+}})[32r,48r:{{[0-9]+}})[48r,48d:{{[0-9]+}}) +# CHECK: # End machine code for function reserved_reg_liveness. 
+name: reserved_reg_liveness +tracksRegLiveness: true +body: | + bb.0: + liveins: %x28_fp + %6 : xseqpairsclass = COPY %x28_fp + %x28_fp = COPY %6 + %x28 = COPY %x28 + %fp = COPY %fp +... diff --git a/test/CodeGen/AArch64/load-combine-big-endian.ll b/test/CodeGen/AArch64/load-combine-big-endian.ll new file mode 100644 index 000000000000..918ceaeb1b4f --- /dev/null +++ b/test/CodeGen/AArch64/load-combine-big-endian.ll @@ -0,0 +1,584 @@ +; RUN: llc < %s -mtriple=arm64eb-unknown | FileCheck %s + +; i8* p; // p is 4 byte aligned +; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_big_endian(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_big_endian: +; CHECK: ldr w0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 4 + %tmp2 = zext i8 %tmp1 to i32 + %tmp3 = shl nuw nsw i32 %tmp2, 24 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 8 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = or i32 %tmp13, %tmp16 + ret i32 %tmp17 +} + +; i8* p; // p is 4 byte aligned +; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[3] << 8) | (i16) p[4]) +define i32 @load_i32_by_i16_by_i8_big_endian(i32* %arg) { +; CHECK-LABEL: load_i32_by_i16_by_i8_big_endian: +; CHECK: ldr w0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 4 + %tmp2 = zext i8 %tmp1 to i16 + %tmp3 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp4 = load i8, i8* %tmp3, align 1 + %tmp5 = zext i8 %tmp4 to i16 + %tmp6 = shl nuw nsw i16 %tmp2, 8 + %tmp7 = or i16 %tmp6, %tmp5 + %tmp8 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp9 = load i8, i8* %tmp8, align 1 + %tmp10 = zext i8 %tmp9 to i16 + %tmp11 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp12 = load i8, i8* %tmp11, align 1 + %tmp13 = zext i8 %tmp12 to i16 + %tmp14 = shl nuw nsw i16 %tmp10, 8 + %tmp15 = or i16 %tmp14, %tmp13 + %tmp16 = zext i16 %tmp7 to i32 + %tmp17 = zext i16 %tmp15 to i32 + %tmp18 = shl nuw nsw i32 %tmp16, 16 + %tmp19 = or i32 %tmp18, %tmp17 + ret i32 %tmp19 +} + +; i16* p; // p is 4 byte aligned +; ((i32) p[0] << 16) | (i32) p[1] +define i32 @load_i32_by_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_i16: +; CHECK: ldr w0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp2 = zext i16 %tmp1 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp5 = zext i16 %tmp4 to i32 + %tmp6 = shl nuw nsw i32 %tmp2, 16 + %tmp7 = or i32 %tmp6, %tmp5 + ret i32 %tmp7 +} + +; i16* p_16; // p_16 is 4 byte aligned +; i8* p_8 = (i8*) p_16; +; (i32) (p_16[0] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i16_i8(i32* %arg) { +; CHECK-LABEL: load_i32_by_i16_i8: +; CHECK: ldr w0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i16* + %tmp1 = bitcast i32* %arg to i8* + %tmp2 = load i16, i16* %tmp, align 4 + %tmp3 = zext i16 %tmp2 to i32 + %tmp4 = shl nuw nsw i32 %tmp3, 16 + %tmp5 = getelementptr inbounds i8, i8* %tmp1, i32 2 + %tmp6 = load i8, i8* %tmp5, align 1 + %tmp7 = zext i8 %tmp6 to i32 + %tmp8 = 
shl nuw nsw i32 %tmp7, 8 + %tmp9 = getelementptr inbounds i8, i8* %tmp1, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = or i32 %tmp8, %tmp11 + %tmp13 = or i32 %tmp12, %tmp4 + ret i32 %tmp13 +} + +; i8* p; // p is 8 byte aligned +; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) +define i64 @load_i64_by_i8_bswap(i64* %arg) { +; CHECK-LABEL: load_i64_by_i8_bswap: +; CHECK: ldr x8, [x0] +; CHECK-NEXT: rev x0, x8 +; CHECK-NEXT: ret + %tmp = bitcast i64* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 8 + %tmp2 = zext i8 %tmp1 to i64 + %tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1 + %tmp4 = load i8, i8* %tmp3, align 1 + %tmp5 = zext i8 %tmp4 to i64 + %tmp6 = shl nuw nsw i64 %tmp5, 8 + %tmp7 = or i64 %tmp6, %tmp2 + %tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2 + %tmp9 = load i8, i8* %tmp8, align 1 + %tmp10 = zext i8 %tmp9 to i64 + %tmp11 = shl nuw nsw i64 %tmp10, 16 + %tmp12 = or i64 %tmp7, %tmp11 + %tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3 + %tmp14 = load i8, i8* %tmp13, align 1 + %tmp15 = zext i8 %tmp14 to i64 + %tmp16 = shl nuw nsw i64 %tmp15, 24 + %tmp17 = or i64 %tmp12, %tmp16 + %tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4 + %tmp19 = load i8, i8* %tmp18, align 1 + %tmp20 = zext i8 %tmp19 to i64 + %tmp21 = shl nuw nsw i64 %tmp20, 32 + %tmp22 = or i64 %tmp17, %tmp21 + %tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5 + %tmp24 = load i8, i8* %tmp23, align 1 + %tmp25 = zext i8 %tmp24 to i64 + %tmp26 = shl nuw nsw i64 %tmp25, 40 + %tmp27 = or i64 %tmp22, %tmp26 + %tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6 + %tmp29 = load i8, i8* %tmp28, align 1 + %tmp30 = zext i8 %tmp29 to i64 + %tmp31 = shl nuw nsw i64 %tmp30, 48 + %tmp32 = or i64 %tmp27, %tmp31 + %tmp33 = getelementptr inbounds i8, i8* %tmp, i64 7 + %tmp34 = load i8, i8* %tmp33, align 1 + %tmp35 = zext i8 %tmp34 to i64 + %tmp36 = shl nuw i64 %tmp35, 56 + %tmp37 = or i64 %tmp32, %tmp36 + ret i64 %tmp37 +} + +; i8* p; // p is 8 byte aligned +; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] +define i64 @load_i64_by_i8(i64* %arg) { +; CHECK-LABEL: load_i64_by_i8: +; CHECK: ldr x0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i64* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 8 + %tmp2 = zext i8 %tmp1 to i64 + %tmp3 = shl nuw i64 %tmp2, 56 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i64 + %tmp7 = shl nuw nsw i64 %tmp6, 48 + %tmp8 = or i64 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i64 + %tmp12 = shl nuw nsw i64 %tmp11, 40 + %tmp13 = or i64 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i64 + %tmp17 = shl nuw nsw i64 %tmp16, 32 + %tmp18 = or i64 %tmp13, %tmp17 + %tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4 + %tmp20 = load i8, i8* %tmp19, align 1 + %tmp21 = zext i8 %tmp20 to i64 + %tmp22 = shl nuw nsw i64 %tmp21, 24 + %tmp23 = or i64 %tmp18, %tmp22 + %tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5 + %tmp25 = load i8, i8* %tmp24, align 1 + %tmp26 = zext i8 %tmp25 to i64 + %tmp27 = shl nuw nsw i64 %tmp26, 16 + %tmp28 = or i64 %tmp23, %tmp27 + %tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6 + %tmp30 
= load i8, i8* %tmp29, align 1 + %tmp31 = zext i8 %tmp30 to i64 + %tmp32 = shl nuw nsw i64 %tmp31, 8 + %tmp33 = or i64 %tmp28, %tmp32 + %tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7 + %tmp35 = load i8, i8* %tmp34, align 1 + %tmp36 = zext i8 %tmp35 to i64 + %tmp37 = or i64 %tmp33, %tmp36 + ret i64 %tmp37 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK: ldur w8, [x0, #1] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: ldur w8, [x0, #-4] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: ldur w0, [x0, #1] +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) 
p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: ldur w0, [x0, #-4] +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +declare i16 @llvm.bswap.i16(i16) + +; i16* p; // p is 4 byte aligned +; (i32) bswap(p[0]) | (i32) bswap(p[1] << 16) +define i32 @load_i32_by_bswap_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_bswap_i16: +; CHECK: ldr w8, [x0] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1) + %tmp2 = zext i16 %tmp11 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4) + %tmp5 = zext i16 %tmp41 to i32 + %tmp6 = shl nuw nsw i32 %tmp5, 16 + %tmp7 = or i32 %tmp6, %tmp2 + ret i32 %tmp7 +} + +; i16* p; // p is 4 byte aligned +; (i32) p[1] | (sext(p[0] << 16) to i32) +define i32 @load_i32_by_sext_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_sext_i16: +; CHECK: ldr w0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp2 = sext i16 %tmp1 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp5 = zext i16 %tmp4 to i32 + %tmp6 = shl nuw nsw i32 %tmp2, 16 + %tmp7 = or i32 %tmp6, %tmp5 + ret i32 %tmp7 +} + +; i8* arg; i32 i; +; p = arg + 12; +; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24) +define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) { +; CHECK-LABEL: load_i32_by_i8_base_offset_index: +; CHECK: add x8, x0, w1, uxtw +; CHECK-NEXT: ldr w8, [x8, #12] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + %tmp = add nuw nsw i32 %i, 3 + %tmp2 = add nuw nsw i32 %i, 2 + %tmp3 = add nuw nsw i32 %i, 1 + %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12 + %tmp5 = zext i32 %i to i64 + %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5 + %tmp7 = load i8, i8* %tmp6, align 4 + %tmp8 = zext i8 %tmp7 to i32 + %tmp9 = zext i32 %tmp3 to i64 + %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9 + %tmp11 = load i8, i8* %tmp10, align 1 + %tmp12 = zext i8 %tmp11 to i32 + %tmp13 = shl nuw nsw i32 %tmp12, 8 + %tmp14 = or i32 %tmp13, %tmp8 + %tmp15 = zext i32 %tmp2 to i64 + %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15 + %tmp17 = load i8, i8* %tmp16, align 1 + %tmp18 = zext i8 %tmp17 to i32 + %tmp19 = shl nuw nsw i32 %tmp18, 16 + %tmp20 = or i32 %tmp14, %tmp19 + %tmp21 = zext i32 %tmp to i64 + %tmp22 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp21 + %tmp23 = load i8, i8* %tmp22, align 1 + %tmp24 = zext i8 %tmp23 to i32 + %tmp25 = shl nuw i32 %tmp24, 24 + %tmp26 = or i32 %tmp20, %tmp25 + ret i32 %tmp26 +} + +; i8* arg; i32 i; +; p = arg + 
12; +; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24) +define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { +; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: +; CHECK: add x8, x0, w1, uxtw +; CHECK-NEXT: ldur w8, [x8, #13] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + %tmp = add nuw nsw i32 %i, 4 + %tmp2 = add nuw nsw i32 %i, 3 + %tmp3 = add nuw nsw i32 %i, 2 + %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12 + %tmp5 = add nuw nsw i32 %i, 1 + %tmp27 = zext i32 %tmp5 to i64 + %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27 + %tmp29 = load i8, i8* %tmp28, align 4 + %tmp30 = zext i8 %tmp29 to i32 + %tmp31 = zext i32 %tmp3 to i64 + %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31 + %tmp33 = load i8, i8* %tmp32, align 1 + %tmp34 = zext i8 %tmp33 to i32 + %tmp35 = shl nuw nsw i32 %tmp34, 8 + %tmp36 = or i32 %tmp35, %tmp30 + %tmp37 = zext i32 %tmp2 to i64 + %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37 + %tmp39 = load i8, i8* %tmp38, align 1 + %tmp40 = zext i8 %tmp39 to i32 + %tmp41 = shl nuw nsw i32 %tmp40, 16 + %tmp42 = or i32 %tmp36, %tmp41 + %tmp43 = zext i32 %tmp to i64 + %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43 + %tmp45 = load i8, i8* %tmp44, align 1 + %tmp46 = zext i8 %tmp45 to i32 + %tmp47 = shl nuw i32 %tmp46, 24 + %tmp48 = or i32 %tmp42, %tmp47 + ret i32 %tmp48 +} +; i8* p; // p is 2 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) +define i32 @zext_load_i32_by_i8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8: +; CHECK: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[0] << 8) | ((i32) p[1] << 16) +define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_shl_8: +; CHECK: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: lsl w0, w8, #8 +; CHECK-NEXT: bfi w0, w9, #16, #8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 8 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[0] << 16) | ((i32) p[1] << 24) +define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_shl_16: +; CHECK: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: lsl w0, w8, #16 +; CHECK-NEXT: bfi w0, w9, #24, #8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 16 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 24 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} +; i8* p; // p is 2 byte aligned +; (i32) p[1] | ((i32) p[0] << 8) 
+define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap: +; CHECK: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[1] << 8) | ((i32) p[0] << 16) +define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8: +; CHECK: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: lsl w0, w8, #8 +; CHECK-NEXT: bfi w0, w9, #16, #8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 8 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[1] << 16) | ((i32) p[0] << 24) +define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16: +; CHECK: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: lsl w0, w8, #16 +; CHECK-NEXT: bfi w0, w9, #24, #8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 16 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 24 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; +; i16* p1.i16 = (i16*) p; +; (p1.i16[0] << 8) | ((i16) p[2]) +; +; This is essentialy a i16 load from p[1], but we don't fold the pattern now +; because in the original DAG we don't have p[1] address available +define i16 @load_i16_from_nonzero_offset(i8* %p) { +; CHECK-LABEL: load_i16_from_nonzero_offset: +; CHECK: ldrh w8, [x0] +; CHECK-NEXT: ldrb w0, [x0, #2] +; CHECK-NEXT: bfi w0, w8, #8, #24 +; CHECK-NEXT: ret + + %p1.i16 = bitcast i8* %p to i16* + %p2.i8 = getelementptr i8, i8* %p, i64 2 + %v1 = load i16, i16* %p1.i16 + %v2.i8 = load i8, i8* %p2.i8 + %v2 = zext i8 %v2.i8 to i16 + %v1.shl = shl i16 %v1, 8 + %res = or i16 %v1.shl, %v2 + ret i16 %res +} diff --git a/test/CodeGen/AArch64/load-combine.ll b/test/CodeGen/AArch64/load-combine.ll new file mode 100644 index 000000000000..f0ed40357f12 --- /dev/null +++ b/test/CodeGen/AArch64/load-combine.ll @@ -0,0 +1,548 @@ +; RUN: llc < %s -mtriple=arm64-unknown | FileCheck %s + +; i8* p; // p is 1 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) +define i32 @load_i32_by_i8_unaligned(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_unaligned: +; CHECK: ldr w0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 
%tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p is 4 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) +define i32 @load_i32_by_i8_aligned(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_aligned: +; CHECK: ldr w0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p is 4 byte aligned +; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_bswap: +; CHECK: ldr w8, [x0] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 4 + %tmp2 = zext i8 %tmp1 to i32 + %tmp3 = shl nuw nsw i32 %tmp2, 24 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 8 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = or i32 %tmp13, %tmp16 + ret i32 %tmp17 +} + +; i8* p; // p is 8 byte aligned +; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) +define i64 @load_i64_by_i8(i64* %arg) { +; CHECK-LABEL: load_i64_by_i8: +; CHECK: ldr x0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i64* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 8 + %tmp2 = zext i8 %tmp1 to i64 + %tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1 + %tmp4 = load i8, i8* %tmp3, align 1 + %tmp5 = zext i8 %tmp4 to i64 + %tmp6 = shl nuw nsw i64 %tmp5, 8 + %tmp7 = or i64 %tmp6, %tmp2 + %tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2 + %tmp9 = load i8, i8* %tmp8, align 1 + %tmp10 = zext i8 %tmp9 to i64 + %tmp11 = shl nuw nsw i64 %tmp10, 16 + %tmp12 = or i64 %tmp7, %tmp11 + %tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3 + %tmp14 = load i8, i8* %tmp13, align 1 + %tmp15 = zext i8 %tmp14 to i64 + %tmp16 = shl nuw nsw i64 %tmp15, 24 + %tmp17 = or i64 %tmp12, %tmp16 + %tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4 + %tmp19 = load i8, i8* %tmp18, align 1 + %tmp20 = zext i8 %tmp19 to i64 + %tmp21 = shl nuw nsw 
i64 %tmp20, 32 + %tmp22 = or i64 %tmp17, %tmp21 + %tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5 + %tmp24 = load i8, i8* %tmp23, align 1 + %tmp25 = zext i8 %tmp24 to i64 + %tmp26 = shl nuw nsw i64 %tmp25, 40 + %tmp27 = or i64 %tmp22, %tmp26 + %tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6 + %tmp29 = load i8, i8* %tmp28, align 1 + %tmp30 = zext i8 %tmp29 to i64 + %tmp31 = shl nuw nsw i64 %tmp30, 48 + %tmp32 = or i64 %tmp27, %tmp31 + %tmp33 = getelementptr inbounds i8, i8* %tmp, i64 7 + %tmp34 = load i8, i8* %tmp33, align 1 + %tmp35 = zext i8 %tmp34 to i64 + %tmp36 = shl nuw i64 %tmp35, 56 + %tmp37 = or i64 %tmp32, %tmp36 + ret i64 %tmp37 +} + +; i8* p; // p is 8 byte aligned +; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] +define i64 @load_i64_by_i8_bswap(i64* %arg) { +; CHECK-LABEL: load_i64_by_i8_bswap: +; CHECK: ldr x8, [x0] +; CHECK-NEXT: rev x0, x8 +; CHECK-NEXT: ret + %tmp = bitcast i64* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 8 + %tmp2 = zext i8 %tmp1 to i64 + %tmp3 = shl nuw i64 %tmp2, 56 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i64 + %tmp7 = shl nuw nsw i64 %tmp6, 48 + %tmp8 = or i64 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i64 + %tmp12 = shl nuw nsw i64 %tmp11, 40 + %tmp13 = or i64 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i64 + %tmp17 = shl nuw nsw i64 %tmp16, 32 + %tmp18 = or i64 %tmp13, %tmp17 + %tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4 + %tmp20 = load i8, i8* %tmp19, align 1 + %tmp21 = zext i8 %tmp20 to i64 + %tmp22 = shl nuw nsw i64 %tmp21, 24 + %tmp23 = or i64 %tmp18, %tmp22 + %tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5 + %tmp25 = load i8, i8* %tmp24, align 1 + %tmp26 = zext i8 %tmp25 to i64 + %tmp27 = shl nuw nsw i64 %tmp26, 16 + %tmp28 = or i64 %tmp23, %tmp27 + %tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6 + %tmp30 = load i8, i8* %tmp29, align 1 + %tmp31 = zext i8 %tmp30 to i64 + %tmp32 = shl nuw nsw i64 %tmp31, 8 + %tmp33 = or i64 %tmp28, %tmp32 + %tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7 + %tmp35 = load i8, i8* %tmp34, align 1 + %tmp36 = zext i8 %tmp35 to i64 + %tmp37 = or i64 %tmp33, %tmp36 + ret i64 %tmp37 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK: ldur w0, [x0, #1] +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned 
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: ldur w0, [x0, #-4] +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: ldur w8, [x0, #1] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: ldur w8, [x0, #-4] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +declare i16 @llvm.bswap.i16(i16) + +; i16* p; // p is 4 byte aligned +; (i32) bswap(p[1]) | (i32) bswap(p[0] << 16) +define i32 @load_i32_by_bswap_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_bswap_i16: +; CHECK: ldr w8, [x0] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1) + %tmp2 = zext i16 %tmp11 to i32 + %tmp3 = 
getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4) + %tmp5 = zext i16 %tmp41 to i32 + %tmp6 = shl nuw nsw i32 %tmp2, 16 + %tmp7 = or i32 %tmp6, %tmp5 + ret i32 %tmp7 +} + +; i16* p; // p is 4 byte aligned +; (i32) p[0] | (sext(p[1] << 16) to i32) +define i32 @load_i32_by_sext_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_sext_i16: +; CHECK: ldr w0, [x0] +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp2 = zext i16 %tmp1 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp5 = sext i16 %tmp4 to i32 + %tmp6 = shl nuw nsw i32 %tmp5, 16 + %tmp7 = or i32 %tmp6, %tmp2 + ret i32 %tmp7 +} + +; i8* arg; i32 i; +; p = arg + 12; +; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24) +define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) { +; CHECK-LABEL: load_i32_by_i8_base_offset_index: +; CHECK: add x8, x0, w1, uxtw +; CHECK-NEXT: ldr w0, [x8, #12] +; CHECK-NEXT: ret + %tmp = add nuw nsw i32 %i, 3 + %tmp2 = add nuw nsw i32 %i, 2 + %tmp3 = add nuw nsw i32 %i, 1 + %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12 + %tmp5 = zext i32 %i to i64 + %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5 + %tmp7 = load i8, i8* %tmp6, align 4 + %tmp8 = zext i8 %tmp7 to i32 + %tmp9 = zext i32 %tmp3 to i64 + %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9 + %tmp11 = load i8, i8* %tmp10, align 1 + %tmp12 = zext i8 %tmp11 to i32 + %tmp13 = shl nuw nsw i32 %tmp12, 8 + %tmp14 = or i32 %tmp13, %tmp8 + %tmp15 = zext i32 %tmp2 to i64 + %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15 + %tmp17 = load i8, i8* %tmp16, align 1 + %tmp18 = zext i8 %tmp17 to i32 + %tmp19 = shl nuw nsw i32 %tmp18, 16 + %tmp20 = or i32 %tmp14, %tmp19 + %tmp21 = zext i32 %tmp to i64 + %tmp22 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp21 + %tmp23 = load i8, i8* %tmp22, align 1 + %tmp24 = zext i8 %tmp23 to i32 + %tmp25 = shl nuw i32 %tmp24, 24 + %tmp26 = or i32 %tmp20, %tmp25 + ret i32 %tmp26 +} + +; i8* arg; i32 i; +; p = arg + 12; +; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24) +define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { +; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: +; CHECK: add x8, x0, w1, uxtw +; CHECK-NEXT: ldur w0, [x8, #13] +; CHECK-NEXT: ret + %tmp = add nuw nsw i32 %i, 4 + %tmp2 = add nuw nsw i32 %i, 3 + %tmp3 = add nuw nsw i32 %i, 2 + %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12 + %tmp5 = add nuw nsw i32 %i, 1 + %tmp27 = zext i32 %tmp5 to i64 + %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27 + %tmp29 = load i8, i8* %tmp28, align 4 + %tmp30 = zext i8 %tmp29 to i32 + %tmp31 = zext i32 %tmp3 to i64 + %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31 + %tmp33 = load i8, i8* %tmp32, align 1 + %tmp34 = zext i8 %tmp33 to i32 + %tmp35 = shl nuw nsw i32 %tmp34, 8 + %tmp36 = or i32 %tmp35, %tmp30 + %tmp37 = zext i32 %tmp2 to i64 + %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37 + %tmp39 = load i8, i8* %tmp38, align 1 + %tmp40 = zext i8 %tmp39 to i32 + %tmp41 = shl nuw nsw i32 %tmp40, 16 + %tmp42 = or i32 %tmp36, %tmp41 + %tmp43 = zext i32 %tmp to i64 + %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43 + %tmp45 = load i8, i8* %tmp44, align 1 + %tmp46 = zext i8 %tmp45 to i32 + %tmp47 = shl nuw i32 %tmp46, 24 + %tmp48 = or i32 %tmp42, %tmp47 + ret i32 %tmp48 +} + +; i8* p; // p is 2 
byte aligned +; (i32) p[0] | ((i32) p[1] << 8) +define i32 @zext_load_i32_by_i8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8: +; CHECK: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[0] << 8) | ((i32) p[1] << 16) +define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_shl_8: +; CHECK: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: lsl w0, w8, #8 +; CHECK-NEXT: bfi w0, w9, #16, #8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 8 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[0] << 16) | ((i32) p[1] << 24) +define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_shl_16: +; CHECK: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: lsl w0, w8, #16 +; CHECK-NEXT: bfi w0, w9, #24, #8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 16 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 24 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} +; i8* p; // p is 2 byte aligned +; (i32) p[1] | ((i32) p[0] << 8) +define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap: +; CHECK: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[1] << 8) | ((i32) p[0] << 16) +define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8: +; CHECK: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: lsl w0, w8, #8 +; CHECK-NEXT: bfi w0, w9, #16, #8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 8 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[1] << 16) | ((i32) p[0] << 24) +define i32 
@zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16: +; CHECK: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: lsl w0, w8, #16 +; CHECK-NEXT: bfi w0, w9, #24, #8 +; CHECK-NEXT: ret + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 16 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 24 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} diff --git a/test/CodeGen/AArch64/machine-combiner-madd.ll b/test/CodeGen/AArch64/machine-combiner-madd.ll index ea3113789461..4efe4e9cfb01 100644 --- a/test/CodeGen/AArch64/machine-combiner-madd.ll +++ b/test/CodeGen/AArch64/machine-combiner-madd.ll @@ -6,7 +6,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m2 < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=vulcan < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s ; Make sure that inst-combine fuses the multiply add in the addressing mode of ; the load. diff --git a/test/CodeGen/AArch64/machine-copy-remove.mir b/test/CodeGen/AArch64/machine-copy-remove.mir new file mode 100644 index 000000000000..6f2d3a3009b0 --- /dev/null +++ b/test/CodeGen/AArch64/machine-copy-remove.mir @@ -0,0 +1,672 @@ +# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-copyelim %s -verify-machineinstrs -o - | FileCheck %s +--- +# Check that bb.0 COPY is seen through to allow the bb.1 COPY of XZR to be removed. +# CHECK-LABEL: name: test1 +# CHECK-NOT: COPY %xzr +name: test1 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + %x0 = COPY %x1 + CBNZX %x1, %bb.2 + + bb.1: + successors: %bb.3 + + %x0 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Similar to test1, but with reversed COPY. +# CHECK-LABEL: name: test2 +# CHECK-NOT: COPY %xzr +name: test2 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + %x1 = COPY %x0 + CBNZX %x1, %bb.2 + + bb.1: + successors: %bb.3 + + %x0 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Similar to test1, but with a clobber that prevents removal of the XZR COPY. +# CHECK-LABEL: name: test3 +# CHECK: COPY %xzr +name: test3 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1, %x2 + + %x0 = COPY %x1 + %x1 = LDRXui %x1, 0 + CBNZX %x1, %bb.2 + + bb.1: + successors: %bb.3 + + %x0 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Similar to test2, but with a clobber that prevents removal of the XZR COPY. 
+# CHECK-LABEL: name: test4 +# CHECK: COPY %xzr +name: test4 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1, %x2 + + %x1 = COPY %x0 + %x1 = LDRXui %x1, 0 + CBNZX %x1, %bb.2 + + bb.1: + successors: %bb.3 + + %x0 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Similar to test2, but with a clobber that prevents removal of the XZR COPY. +# CHECK-LABEL: name: test5 +# CHECK: COPY %xzr +name: test5 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1, %x2 + + %x1 = COPY %x0 + %x0 = LDRXui %x1, 0 + CBNZX %x1, %bb.2 + + bb.1: + successors: %bb.3 + + %x0 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Similar to test1, but with two levels of COPYs. +# CHECK-LABEL: name: test6 +# CHECK-NOT: COPY %xzr +name: test6 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1, %x2 + + %x2 = COPY %x0 + %x1 = COPY %x2 + CBNZX %x1, %bb.2 + + bb.1: + successors: %bb.3 + + %x0 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Similar to test1, but with two levels of COPYs and a clobber preventing COPY of XZR removal. +# CHECK-LABEL: name: test7 +# CHECK: COPY %xzr +name: test7 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1, %x2 + + %x2 = COPY %x0 + %x0 = LDRXui %x1, 0 + %x1 = COPY %x2 + CBNZX %x1, %bb.2 + + bb.1: + successors: %bb.3 + + %x0 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Check that the TargetRegs vector clobber update loop in +# AArch64RedundantCopyElimination::optimizeCopy works correctly. +# CHECK-LABEL: name: test8 +# CHECK: x0 = COPY %xzr +# CHECK: x1 = COPY %xzr +name: test8 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + %x1 = COPY %x0 + CBNZX %x1, %bb.2 + + bb.1: + successors: %bb.3 + liveins: %x0, %x2 + + %x0, %x1 = LDPXi %x2, 0 + %x0 = COPY %xzr + %x1 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Check that copy isn't removed from a block with multiple predecessors. +# CHECK-LABEL: name: test9 +# CHECK: x0 = COPY %xzr +# CHECK-NEXT: B %bb.3 +name: test9 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + CBNZX %x0, %bb.2 + + bb.1: + successors: %bb.3 + liveins: %x0, %x2 + + %x0 = COPY %xzr + B %bb.3 + + bb.2: + successors: %bb.1, %bb.3 + liveins: %x1 + + %x0 = LDRXui %x1, 0 + + CBNZX %x1, %bb.1 + + bb.3: + liveins: %x0 + + RET_ReallyLR implicit %x0 + +... +# Eliminate redundant MOVi32imm 7 in bb.1 +# Note: 32-bit compare/32-bit move imm +# Kill marker should be removed from compare. 
+# CHECK-LABEL: name: test10 +# CHECK: SUBSWri %w0, 7, 0, implicit-def %nzcv +# CHECK: bb.1: +# CHECK-NOT: MOVi32imm +name: test10 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1 + + dead %wzr = SUBSWri killed %w0, 7, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm 7 + STRWui killed %w0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Eliminate redundant MOVi32imm 7 in bb.1 +# Note: 64-bit compare/32-bit move imm w/implicit def +# Kill marker should be removed from compare. +# CHECK-LABEL: name: test11 +# CHECK: SUBSXri %x0, 7, 0, implicit-def %nzcv +# CHECK: bb.1: +# CHECK-NOT: MOVi32imm +name: test11 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + dead %xzr = SUBSXri killed %x0, 7, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm 7, implicit-def %x0 + STRXui killed %x0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Eliminate redundant MOVi32imm 7 in bb.1 +# Note: 64-bit compare/32-bit move imm +# Kill marker should be removed from compare. +# CHECK-LABEL: name: test12 +# CHECK: SUBSXri %x0, 7, 0, implicit-def %nzcv +# CHECK: bb.1: +# CHECK-NOT: MOVi32imm +name: test12 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + dead %xzr = SUBSXri killed %x0, 7, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm 7 + STRWui killed %w0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Don't eliminate MOVi32imm 7 in bb.1 as we don't necessarily know the upper 32-bits. +# Note: 32-bit compare/32-bit move imm w/implicit def +# Kill marker should remain on compare. +# CHECK-LABEL: name: test13 +# CHECK: SUBSWri killed %w0, 7, 0, implicit-def %nzcv +# CHECK: bb.1: +# CHECK: MOVi32imm +name: test13 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1 + + dead %wzr = SUBSWri killed %w0, 7, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm 7, implicit-def %x0 + STRXui killed %x0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# We can't eliminate the MOVi32imm because of the clobbering LDRWui. +# CHECK-LABEL: name: test14 +# CHECK: bb.1: +# CHECK: MOVi32imm +name: test14 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1, %x2 + + dead %wzr = SUBSWri killed %w0, 7, 0, implicit-def %nzcv + %w0 = LDRWui %x1, 0 + STRWui killed %w0, killed %x2, 0 + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm 7 + STRWui killed %w0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# We can't eliminate the MOVi32imm because of the clobbering LDRWui. +# CHECK-LABEL: name: test15 +# CHECK: bb.1: +# CHECK: MOVi32imm +name: test15 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1, %x2 + + dead %wzr = SUBSWri killed %w0, 7, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1, %x2 + + %w0 = LDRWui %x1, 0 + STRWui killed %w0, killed %x2, 0 + %w0 = MOVi32imm 7 + STRWui killed %w0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Check that bb.0 COPY is seen through to allow the bb.1 MOVi32imm to be removed. 
+# CHECK-LABEL: name: test16 +# CHECK: bb.1: +# CHECK-NOT: MOVi32imm +name: test16 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1 + + dead %wzr = SUBSWri %w0, 7, 0, implicit-def %nzcv + %w2 = COPY %w0 + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w2 = MOVi32imm 7 + STRWui killed %w2, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Check that bb.1 MOVi32imm is not removed due to self clobbering compare. +# CHECK-LABEL: name: test17 +# CHECK: bb.1: +# CHECK: MOVi32imm +name: test17 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1 + + dead %w0 = SUBSWri killed %w0, 7, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm 7 + STRWui killed %w0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Make sure the MOVi64imm is not removed. In one version of this patch the +# MOVi64imm immediate was truncated to 32 bits and incorrectly matched because +# the low 32 bits of 4252017623040 are all zero. +# CHECK-LABEL: name: test18 +# CHECK: bb.1: +# CHECK: MOVi64imm +name: test18 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + CBNZX killed %x0, %bb.2 + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %x0 = MOVi64imm 4252017623040 + STRXui killed %x0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Eliminate redundant MOVi32imm -1 in bb.1 +# Note: 32-bit compare/32-bit move imm +# Kill marker should be removed from compare. +# CHECK-LABEL: name: test19 +# CHECK: ADDSWri %w0, 1, 0, implicit-def %nzcv +# CHECK: bb.1: +# CHECK-NOT: MOVi32imm +name: test19 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1 + + dead %wzr = ADDSWri killed %w0, 1, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm -1 + STRWui killed %w0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Eliminate redundant MOVi64imm -1 in bb.1 +# Note: 64-bit compare/64-bit move imm +# Kill marker should be removed from compare. +# CHECK-LABEL: name: test20 +# CHECK: ADDSXri %x0, 1, 0, implicit-def %nzcv +# CHECK: bb.1: +# CHECK-NOT: MOVi64imm +name: test20 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + dead %xzr = ADDSXri killed %x0, 1, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %x0 = MOVi64imm -1 + STRXui killed %x0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Eliminate redundant MOVi32imm -1 in bb.1 +# Note: 64-bit compare/32-bit move imm +# Kill marker should be removed from compare. +# CHECK-LABEL: name: test21 +# CHECK: ADDSXri %x0, 1, 0, implicit-def %nzcv +# CHECK: bb.1: +# CHECK-NOT: MOVi32imm +name: test21 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %x0, %x1 + + dead %xzr = ADDSXri killed %x0, 1, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm -1 + STRWui killed %w0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Don't eliminate MOVi64imm -1 in bb.1 as we don't necessarily know the upper 32-bits. 
+# Note: 32-bit compare/64-bit move imm +# CHECK-LABEL: name: test22 +# CHECK: bb.1: +# CHECK: MOVi64imm +name: test22 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1 + + dead %wzr = ADDSWri killed %w0, 1, 0, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %x0 = MOVi64imm -1 + STRXui killed %x0, killed %x1, 0 + + bb.2: + RET_ReallyLR +... +# Eliminate redundant MOVi32imm 4096 in bb.1 when the compare has a shifted immediate. +# CHECK-LABEL: name: test23 +# CHECK: bb.1: +# CHECK-NOT: MOVi32imm +name: test23 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: %w0, %x1 + + dead %wzr = SUBSWri killed %w0, 1, 12, implicit-def %nzcv + Bcc 1, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x1 + + %w0 = MOVi32imm 4096 + STRWui killed %w0, killed %x1, 0 + + bb.2: + RET_ReallyLR diff --git a/test/CodeGen/AArch64/machine-outliner.ll b/test/CodeGen/AArch64/machine-outliner.ll new file mode 100644 index 000000000000..b5094fe47508 --- /dev/null +++ b/test/CodeGen/AArch64/machine-outliner.ll @@ -0,0 +1,43 @@ +; RUN: llc -enable-machine-outliner -mtriple=aarch64-apple-darwin < %s | FileCheck %s + +define void @cat() #0 { +; CHECK-LABEL: _cat: +; CHECK: b l_OUTLINED_FUNCTION_0 +; CHECK-NOT: ret + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + store i32 1, i32* %2, align 4 + store i32 2, i32* %3, align 4 + store i32 3, i32* %4, align 4 + ret void +} + +define void @dog() #0 { +; CHECK-LABEL: _dog: +; CHECK: b l_OUTLINED_FUNCTION_0 +; CHECK-NOT: ret + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + store i32 0, i32* %1, align 4 + store i32 1, i32* %2, align 4 + store i32 2, i32* %3, align 4 + store i32 3, i32* %4, align 4 + ret void +} + +; CHECK-LABEL: l_OUTLINED_FUNCTION_0: +; CHECK: orr w8, wzr, #0x1 +; CHECK-NEXT: stp w8, wzr, [sp, #8] +; CHECK-NEXT: orr w8, wzr, #0x2 +; CHECK-NEXT: str w8, [sp, #4] +; CHECK-NEXT: orr w8, wzr, #0x3 +; CHECK-NEXT: str w8, [sp], #16 +; CHECK-NEXT: ret + + +attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" } diff --git a/test/CodeGen/AArch64/mature-mc-support.ll b/test/CodeGen/AArch64/mature-mc-support.ll index 276c54d2cc4e..dbc027143f99 100644 --- a/test/CodeGen/AArch64/mature-mc-support.ll +++ b/test/CodeGen/AArch64/mature-mc-support.ll @@ -9,4 +9,4 @@ module asm " .this_directive_is_very_unlikely_to_exist" -; CHECK: LLVM ERROR: Error parsing inline asm +; CHECK: error: unknown directive diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll index 1d0196ad521d..1d26e4a42b17 100644 --- a/test/CodeGen/AArch64/merge-store.ll +++ b/test/CodeGen/AArch64/merge-store.ll @@ -4,8 +4,7 @@ @g0 = external global <3 x float>, align 16 @g1 = external global <3 x float>, align 4 -; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4 -; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}} +; CHECK: ldr q[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0 ; CHECK: str d[[R0]] define void @blam() { diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll new file mode 100644 index 000000000000..f29dfb3a9802 --- /dev/null +++ b/test/CodeGen/AArch64/misched-fusion-aes.ll @@ -0,0 +1,207 @@ +; RUN: llc %s -o - 
-mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1 + +declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) +declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %d) +declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d, <16 x i8> %k) +declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %d) + +define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) { + %d0 = load <16 x i8>, <16 x i8>* %a0 + %a1 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 1 + %d1 = load <16 x i8>, <16 x i8>* %a1 + %a2 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 2 + %d2 = load <16 x i8>, <16 x i8>* %a2 + %a3 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 3 + %d3 = load <16 x i8>, <16 x i8>* %a3 + %k0 = load <16 x i8>, <16 x i8>* %b0 + %e00 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d0, <16 x i8> %k0) + %f00 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e00) + %e01 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d1, <16 x i8> %k0) + %f01 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e01) + %e02 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d2, <16 x i8> %k0) + %f02 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e02) + %e03 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d3, <16 x i8> %k0) + %f03 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e03) + %b1 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 1 + %k1 = load <16 x i8>, <16 x i8>* %b1 + %e10 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f00, <16 x i8> %k1) + %f10 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e00) + %e11 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f01, <16 x i8> %k1) + %f11 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e01) + %e12 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f02, <16 x i8> %k1) + %f12 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e02) + %e13 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f03, <16 x i8> %k1) + %f13 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e03) + %b2 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 2 + %k2 = load <16 x i8>, <16 x i8>* %b2 + %e20 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f10, <16 x i8> %k2) + %f20 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e10) + %e21 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f11, <16 x i8> %k2) + %f21 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e11) + %e22 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f12, <16 x i8> %k2) + %f22 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e12) + %e23 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f13, <16 x i8> %k2) + %f23 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e13) + %b3 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 3 + %k3 = load <16 x i8>, <16 x i8>* %b3 + %e30 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f20, <16 x i8> %k3) + %f30 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e20) + %e31 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f21, <16 x i8> %k3) + %f31 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e21) + %e32 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f22, <16 x i8> %k3) + %f32 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e22) + %e33 = call <16 x i8> 
@llvm.aarch64.crypto.aese(<16 x i8> %f23, <16 x i8> %k3) + %f33 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e23) + %g0 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f30, <16 x i8> %d) + %h0 = xor <16 x i8> %g0, %e + %g1 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f31, <16 x i8> %d) + %h1 = xor <16 x i8> %g1, %e + %g2 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f32, <16 x i8> %d) + %h2 = xor <16 x i8> %g2, %e + %g3 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f33, <16 x i8> %d) + %h3 = xor <16 x i8> %g3, %e + store <16 x i8> %h0, <16 x i8>* %c0 + %c1 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 1 + store <16 x i8> %h1, <16 x i8>* %c1 + %c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2 + store <16 x i8> %h2, <16 x i8>* %c2 + %c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3 + store <16 x i8> %h3, <16 x i8>* %c3 + ret void + +; CHECK-LABEL: aesea: +; CHECKA57: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKA57: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKA57: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKA57: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKA57: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKA57: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKA57: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]] +; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}} +; CHECKM1: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKM1: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKM1: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKM1: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKM1: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VG]] +; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +} + +define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) { + %d0 = load <16 x i8>, <16 x i8>* %a0 + %a1 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 1 + %d1 = load <16 x i8>, <16 x i8>* %a1 + %a2 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 2 + %d2 = load <16 x i8>, <16 x i8>* %a2 + %a3 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 3 + %d3 = load <16 x i8>, <16 x i8>* %a3 + %k0 = load <16 x i8>, <16 x i8>* %b0 + %e00 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d0, <16 x i8> %k0) + %f00 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e00) + %e01 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d1, <16 x i8> %k0) + %f01 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e01) + %e02 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d2, <16 x i8> %k0) + %f02 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e02) + %e03 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d3, <16 x i8> %k0) + 
%f03 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e03) + %b1 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 1 + %k1 = load <16 x i8>, <16 x i8>* %b1 + %e10 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f00, <16 x i8> %k1) + %f10 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e00) + %e11 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f01, <16 x i8> %k1) + %f11 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e01) + %e12 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f02, <16 x i8> %k1) + %f12 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e02) + %e13 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f03, <16 x i8> %k1) + %f13 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e03) + %b2 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 2 + %k2 = load <16 x i8>, <16 x i8>* %b2 + %e20 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f10, <16 x i8> %k2) + %f20 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e10) + %e21 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f11, <16 x i8> %k2) + %f21 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e11) + %e22 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f12, <16 x i8> %k2) + %f22 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e12) + %e23 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f13, <16 x i8> %k2) + %f23 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e13) + %b3 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 3 + %k3 = load <16 x i8>, <16 x i8>* %b3 + %e30 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f20, <16 x i8> %k3) + %f30 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e20) + %e31 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f21, <16 x i8> %k3) + %f31 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e21) + %e32 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f22, <16 x i8> %k3) + %f32 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e22) + %e33 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f23, <16 x i8> %k3) + %f33 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e23) + %g0 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f30, <16 x i8> %d) + %h0 = xor <16 x i8> %g0, %e + %g1 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f31, <16 x i8> %d) + %h1 = xor <16 x i8> %g1, %e + %g2 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f32, <16 x i8> %d) + %h2 = xor <16 x i8> %g2, %e + %g3 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f33, <16 x i8> %d) + %h3 = xor <16 x i8> %g3, %e + store <16 x i8> %h0, <16 x i8>* %c0 + %c1 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 1 + store <16 x i8> %h1, <16 x i8>* %c1 + %c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2 + store <16 x i8> %h2, <16 x i8>* %c2 + %c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3 + store <16 x i8> %h3, <16 x i8>* %c3 + ret void + +; CHECK-LABEL: aesda: +; CHECKA57: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKA57: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKA57: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKA57: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKA57: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKA57: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesimc 
{{v[0-7].16b}}, [[VF]] +; CHECKA57: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}} +; CHECKM1: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKM1: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKM1: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKM1: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VF]] +; CHECKM1: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +} diff --git a/test/CodeGen/AArch64/misched-fusion-lit.ll b/test/CodeGen/AArch64/misched-fusion-lit.ll new file mode 100644 index 000000000000..45aa67ef1d54 --- /dev/null +++ b/test/CodeGen/AArch64/misched-fusion-lit.ll @@ -0,0 +1,46 @@ +; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=-fuse-literals | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKDONT +; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-literals | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSE +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSE + +@g = common local_unnamed_addr global i8* null, align 8 + +define i8* @litp(i32 %a, i32 %b) { +entry: + %add = add nsw i32 %b, %a + %idx.ext = sext i32 %add to i64 + %add.ptr = getelementptr i8, i8* bitcast (i8* (i32, i32)* @litp to i8*), i64 %idx.ext + store i8* %add.ptr, i8** @g, align 8 + ret i8* %add.ptr + +; CHECK-LABEL: litp: +; CHECK: adrp [[R:x[0-9]+]], litp +; CHECKDONT-NEXT: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECKFUSE-NEXT: add {{x[0-9]+}}, [[R]], :lo12:litp +} + +define i32 @liti(i32 %a, i32 %b) { +entry: + %add = add i32 %a, -262095121 + %add1 = add i32 %add, %b + ret i32 %add1 + +; CHECK-LABEL: liti: +; CHECK: mov [[R:w[0-9]+]], {{#[0-9]+}} +; CHECKDONT-NEXT: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECKFUSE-NEXT: movk [[R]], {{#[0-9]+}}, lsl #16 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @litl(i64 %a, i64 %b) { +entry: + %add = add i64 %a, 2208998440489107183 + %add1 = add i64 %add, %b + ret i64 %add1 + +; CHECK-LABEL: litl: +; CHECK: mov [[R:x[0-9]+]], {{#[0-9]+}} +; CHECK-NEXT: movk [[R]], {{#[0-9]+}}, lsl #16 +; CHECK: movk [[R]], {{#[0-9]+}}, lsl #32 +; CHECKDONT-NEXT: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECKFUSE-NEXT: movk [[R]], {{#[0-9]+}}, lsl #48 +} diff --git a/test/CodeGen/AArch64/misched-fusion.ll b/test/CodeGen/AArch64/misched-fusion.ll index d5dd9c757dfd..1d504a2f1931 100644 --- a/test/CodeGen/AArch64/misched-fusion.ll +++ b/test/CodeGen/AArch64/misched-fusion.ll @@ -1,22 +1,14 @@ ; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s -target triple = "arm64-apple-ios" +target triple = "aarch64-unknown" declare void @foobar(i32 %v0, i32 %v1) ; Make sure sub is scheduled in front of cbnz ; CHECK-LABEL: test_sub_cbz: -; CHECK: add w[[ADDRES:[0-9]+]], w1, #7 ; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13 -; CHECK-NEXT: cbnz w[[SUBRES]], 
[[SKIPBLOCK:LBB[0-9_]+]] -; CHECK: mov [[REGTY:[x,w]]]0, [[REGTY]][[ADDRES]] -; CHECK: mov [[REGTY]]1, [[REGTY]][[SUBRES]] -; CHECK: bl _foobar -; CHECK: [[SKIPBLOCK]]: -; CHECK: mov [[REGTY]]0, [[REGTY]][[SUBRES]] -; CHECK: mov [[REGTY]]1, [[REGTY]][[ADDRES]] -; CHECK: bl _foobar +; CHECK-NEXT: cbnz w[[SUBRES]], {{.?LBB[0-9_]+}} define void @test_sub_cbz(i32 %a0, i32 %a1) { entry: ; except for the fusion opportunity the sub/add should be equal so the diff --git a/test/CodeGen/AArch64/movimm-wzr.mir b/test/CodeGen/AArch64/movimm-wzr.mir index 093f85bd9319..60e9bfa03a96 100644 --- a/test/CodeGen/AArch64/movimm-wzr.mir +++ b/test/CodeGen/AArch64/movimm-wzr.mir @@ -1,4 +1,4 @@ -# RUN: llc -run-pass=aarch64-expand-pseudo %s -o - 2>&1 | FileCheck %s +# RUN: llc -run-pass=aarch64-expand-pseudo %s -o - | FileCheck %s --- | ; ModuleID = 'simple.ll' diff --git a/test/CodeGen/AArch64/movw-shift-encoding.ll b/test/CodeGen/AArch64/movw-shift-encoding.ll index 178fccce333b..673bd85bd167 100644 --- a/test/CodeGen/AArch64/movw-shift-encoding.ll +++ b/test/CodeGen/AArch64/movw-shift-encoding.ll @@ -8,8 +8,8 @@ define i32* @get_var() { ret i32* @var -; CHECK: movz x0, #:abs_g3:var // encoding: [0bAAA00000,A,0b111AAAAA,0xd2] -; CHECK: movk x0, #:abs_g2_nc:var // encoding: [0bAAA00000,A,0b110AAAAA,0xf2] -; CHECK: movk x0, #:abs_g1_nc:var // encoding: [0bAAA00000,A,0b101AAAAA,0xf2] -; CHECK: movk x0, #:abs_g0_nc:var // encoding: [0bAAA00000,A,0b100AAAAA,0xf2] +; CHECK: movz x0, #:abs_g0_nc:var // encoding: [0bAAA00000,A,0b100AAAAA,0xd2] +; CHECK: movk x0, #:abs_g1_nc:var // encoding: [0bAAA00000,A,0b101AAAAA,0xf2] +; CHECK: movk x0, #:abs_g2_nc:var // encoding: [0bAAA00000,A,0b110AAAAA,0xf2] +; CHECK: movk x0, #:abs_g3:var // encoding: [0bAAA00000,A,0b111AAAAA,0xf2] } diff --git a/test/CodeGen/AArch64/neon-fma-FMF.ll b/test/CodeGen/AArch64/neon-fma-FMF.ll new file mode 100644 index 000000000000..25beef6592b2 --- /dev/null +++ b/test/CodeGen/AArch64/neon-fma-FMF.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <2 x float> @fma(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp1 = fmul contract <2 x float> %A, %B; + %tmp2 = fadd contract <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <2 x float> @no_fma_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: no_fma_1: +; CHECK: fmul +; CHECK: fadd + %tmp1 = fmul contract <2 x float> %A, %B; + %tmp2 = fadd <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <2 x float> @no_fma_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: no_fma_2: +; CHECK: fmul +; CHECK: fadd + %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fadd contract <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <2 x float> @fma_sub(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: fma_sub: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp1 = fmul contract <2 x float> %A, %B; + %tmp2 = fsub contract <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <2 x float> @no_fma_sub_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: no_fma_sub_1: +; CHECK: fmul +; CHECK: fsub + %tmp1 = fmul contract <2 x float> %A, %B; + %tmp2 = fsub <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <2 x float> @no_fma_sub_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; CHECK-LABEL: no_fma_sub_2: +; CHECK: fmul +; CHECK: fsub 
+ %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fsub contract <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} diff --git a/test/CodeGen/AArch64/optimize-cond-branch.ll b/test/CodeGen/AArch64/optimize-cond-branch.ll index 4e3ca6f16e78..ab4ad5e2ce93 100644 --- a/test/CodeGen/AArch64/optimize-cond-branch.ll +++ b/test/CodeGen/AArch64/optimize-cond-branch.ll @@ -11,7 +11,7 @@ target triple = "arm64--" ; ; CHECK-LABEL: func ; CHECK-NOT: and -; CHECK: tbnz +; CHECK: tbz define void @func() { %c0 = icmp sgt i64 0, 0 br i1 %c0, label %b1, label %b6 diff --git a/test/CodeGen/AArch64/pr27816.ll b/test/CodeGen/AArch64/pr27816.ll new file mode 100644 index 000000000000..df15755cf3f5 --- /dev/null +++ b/test/CodeGen/AArch64/pr27816.ll @@ -0,0 +1,48 @@ +; RUN: llc %s -mtriple=aarch64 -o - | FileCheck %s + +%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8, i32 } + +; The existence of the final i32 value should not prevent the i8s from +; being merged. + +; CHECK-LABEL: @merge_const_store +; CHECK-NOT: strb +; CHECK: str x8, [x1] +; CHECK-NOT: strb +; CHECK: str wzr, [x1, #8] +; CHECK-NOT: strb +define void @merge_const_store(i32 %count, %struct.A* nocapture %p) { + %1 = icmp sgt i32 %count, 0 + br i1 %1, label %.lr.ph, label %._crit_edge +.lr.ph: + %i.02 = phi i32 [ %add, %.lr.ph ], [ 0, %0 ] + %.01 = phi %struct.A* [ %addr, %.lr.ph ], [ %p, %0 ] + %a2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0 + store i8 1, i8* %a2, align 1 + %a3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1 + store i8 2, i8* %a3, align 1 + %a4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2 + store i8 3, i8* %a4, align 1 + %a5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3 + store i8 4, i8* %a5, align 1 + %a6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4 + store i8 5, i8* %a6, align 1 + %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5 + store i8 6, i8* %a7, align 1 + %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6 + store i8 7, i8* %a8, align 1 + %a9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7 + store i8 8, i8* %a9, align 1 + + ; + %addr_last = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 8 + store i32 0, i32* %addr_last, align 4 + + + %add = add nsw i32 %i.02, 1 + %addr = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1 + %exitcond = icmp eq i32 %add, %count + br i1 %exitcond, label %._crit_edge, label %.lr.ph +._crit_edge: + ret void +} diff --git a/test/CodeGen/AArch64/prefixdata.ll b/test/CodeGen/AArch64/prefixdata.ll new file mode 100644 index 000000000000..f62734c16e52 --- /dev/null +++ b/test/CodeGen/AArch64/prefixdata.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -mtriple=aarch64-apple-darwin | FileCheck --check-prefix=MACHO %s +; RUN: llc < %s -mtriple=aarch64-pc-linux | FileCheck --check-prefix=ELF %s + +@i = linkonce_odr global i32 1 + +; MACHO: ltmp0: +; MACHO-NEXT: .long 1 +; MACHO-NEXT: .alt_entry _f +; MACHO-NEXT: _f: +; ELF: .type f,@function +; ELF-NEXT: .word 1 +; ELF-NEXT: // 0x1 +; ELF-NEXT: f: +define void @f() prefix i32 1 { + ret void +} + +; MACHO: ltmp1: +; MACHO-NEXT: .quad _i +; MACHO-NEXT: .alt_entry _g +; MACHO-NEXT: _g: +; ELF: .type g,@function +; ELF-NEXT: .xword i +; ELF-NEXT: g: +define void @g() prefix i32* @i { + ret void +} + +; MACHO: .subsections_via_symbols diff --git a/test/CodeGen/AArch64/regcoal-physreg.mir b/test/CodeGen/AArch64/regcoal-physreg.mir index c6133991171b..813106366968 100644 --- 
a/test/CodeGen/AArch64/regcoal-physreg.mir +++ b/test/CodeGen/AArch64/regcoal-physreg.mir @@ -1,5 +1,7 @@ # RUN: llc -mtriple=aarch64-apple-ios -run-pass=simple-register-coalescing %s -o - | FileCheck %s --- | + declare void @f2() + define void @func0() { ret void } define void @func1() { ret void } define void @func2() { ret void } @@ -8,36 +10,25 @@ # Check coalescing of COPYs from reserved physregs. # CHECK-LABEL: name: func0 name: func0 -registers: - - { id: 0, class: gpr32 } - - { id: 1, class: gpr64 } - - { id: 2, class: gpr64 } - - { id: 3, class: gpr32 } - - { id: 4, class: gpr64 } - - { id: 5, class: gpr32 } - - { id: 6, class: xseqpairsclass } - - { id: 7, class: gpr64 } - - { id: 8, class: gpr64sp } - - { id: 9, class: gpr64sp } body: | bb.0: ; We usually should not coalesce copies from allocatable physregs. ; CHECK: %0 = COPY %w7 ; CHECK: STRWui %0, %x1, 0 - %0 = COPY %w7 + %0 : gpr32 = COPY %w7 STRWui %0, %x1, 0 ; It is fine to coalesce copies from reserved physregs ; CHECK-NOT: COPY ; CHECK: STRXui %fp, %x1, 0 - %1 = COPY %fp + %1 : gpr64 = COPY %fp STRXui %1, %x1, 0 ; It is not fine to coalesce copies from reserved physregs when they are ; clobbered. ; CHECK: %2 = COPY %fp ; CHECK: STRXui %2, %x1, 0 - %2 = COPY %fp + %2 : gpr64 = COPY %fp %fp = SUBXri %fp, 4, 0 STRXui %2, %x1, 0 @@ -45,7 +36,7 @@ body: | ; clobbered. ; CHECK-NOT: COPY ; CHECK: STRWui %wzr, %x1 - %3 = COPY %wzr + %3 : gpr32 = COPY %wzr dead %wzr = SUBSWri %w1, 0, 0, implicit-def %nzcv STRWui %3, %x1, 0 @@ -53,13 +44,13 @@ body: | ; clobbered. ; CHECK-NOT: COPY ; CHECK: STRXui %xzr, %x1 - %4 = COPY %xzr + %4 : gpr64 = COPY %xzr dead %wzr = SUBSWri %w1, 0, 0, implicit-def %nzcv STRXui %4, %x1, 0 ; Coalescing COPYs into constant physregs. ; CHECK: %wzr = SUBSWri %w1, 0, 0 - %5 = SUBSWri %w1, 0, 0, implicit-def %nzcv + %5 : gpr32 = SUBSWri %w1, 0, 0, implicit-def %nzcv %wzr = COPY %5 ; Only coalesce when the source register is reserved as a whole (this is @@ -67,12 +58,24 @@ body: | ; of the non-reserved part). ; CHECK: %6 = COPY %x28_fp ; CHECK: HINT 0, implicit %6 - %6 = COPY %x28_fp + %6 : xseqpairsclass = COPY %x28_fp HINT 0, implicit %6 + ; It is not fine to coalesce copies from reserved physregs when they are + ; clobbered by the regmask on a call. + ; CHECK: %7 = COPY %x18 + ; CHECK: BL @f2, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp + ; CHECK: STRXui %7, %x1, 0 + + ; Need a def of x18 so that it's not deduced as "constant". + %x18 = COPY %xzr + %7 : gpr64 = COPY %x18 + BL @f2, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp + STRXui %7, %x1, 0 + ; This can be coalesced. ; CHECK: %fp = SUBXri %fp, 4, 0 - %8 = SUBXri %fp, 4, 0 + %8 : gpr64sp = SUBXri %fp, 4, 0 %fp = COPY %8 ; Cannot coalesce when there are reads of the physreg. @@ -80,7 +83,7 @@ body: | ; CHECK: %9 = SUBXri %fp, 8, 0 ; CHECK: STRXui %fp, %fp, 0 ; CHECK: %fp = COPY %9 - %9 = SUBXri %fp, 8, 0 + %9 : gpr64sp = SUBXri %fp, 8, 0 STRXui %fp, %fp, 0 %fp = COPY %9 ... @@ -88,8 +91,6 @@ body: | # Check coalescing of COPYs from reserved physregs. 
# CHECK-LABEL: name: func1 name: func1 -registers: - - { id: 0, class: gpr64sp } body: | bb.0: successors: %bb.1, %bb.2 @@ -99,7 +100,7 @@ body: | ; CHECK: %0 = SUBXri %fp, 12, 0 ; CHECK: CBZX undef %x0, %bb.1 ; CHECK: B %bb.2 - %0 = SUBXri %fp, 12, 0 + %0 : gpr64sp = SUBXri %fp, 12, 0 CBZX undef %x0, %bb.1 B %bb.2 @@ -114,8 +115,6 @@ body: | --- # CHECK-LABEL: name: func2 name: func2 -registers: - - { id: 0, class: gpr64sp } body: | bb.0: successors: %bb.1, %bb.2 @@ -123,7 +122,7 @@ body: | ; CHECK-NOT: COPY ; CHECK: CBZX undef %x0, %bb.1 ; CHECK-NEXT: B %bb.2 - %0 = COPY %fp + %0 : gpr64sp = COPY %fp CBZX undef %x0, %bb.1 B %bb.2 diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll index 4bec512403c4..24038cda5078 100644 --- a/test/CodeGen/AArch64/regress-tblgen-chains.ll +++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -28,7 +28,7 @@ define i64 @test_chains() { ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]] ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1 ; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]] -; CHECK; and w0, w[[STRVAL]], #0xff +; CHECK: and w0, w[[STRVAL]], #0xff %ret.1 = load i8, i8* %locvar %ret.2 = zext i8 %ret.1 to i64 diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll index 5081a9da3404..80a054beb2a5 100644 --- a/test/CodeGen/AArch64/remat.ll +++ b/test/CodeGen/AArch64/remat.ll @@ -8,7 +8,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m3 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=falkor -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s -; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=vulcan -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx2t99 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mattr=+custom-cheap-as-move -o - %s | FileCheck %s %X = type { i64, i64, i64 } diff --git a/test/CodeGen/AArch64/selectiondag-order.ll b/test/CodeGen/AArch64/selectiondag-order.ll new file mode 100644 index 000000000000..9427906160fd --- /dev/null +++ b/test/CodeGen/AArch64/selectiondag-order.ll @@ -0,0 +1,96 @@ +; Check that debug intrinsics do not affect code generation. 
+ +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+avx | FileCheck --check-prefix=AARCH64-CHECK %s + +define i64 @simulate(<2 x i32> %a) { +entry: + %rand = tail call i64 @lrand48() + br label %body + +body: ; preds = %body, %entry + %0 = phi <2 x i32> [ %add, %body ], [ zeroinitializer, %entry ] + %add = add <2 x i32> %0, %a + %rand1 = tail call i64 @lrand48() #3 + %cmp = icmp eq i64 %rand1, 0 + br i1 %cmp, label %end, label %body + +end: ; preds = %body + %c = bitcast <2 x i32> %add to i64 + %res = add i64 %rand, %c + ret i64 %res +} + +; AARCH64-CHECK: simulate: +; AARCH64-CHECK: movi d9, #0000000000000000 +; AARCH64-CHECK: bl lrand48 +; AARCH64-CHECK: mov x19, x0 +; AARCH64-CHECK: BB0_1: + + +define i64 @simulateWithDebugIntrinsic(<2 x i32> %a) local_unnamed_addr { +entry: + %rand = tail call i64 @lrand48() #3 + tail call void @llvm.dbg.value(metadata i64 %rand, i64 0, metadata !6, metadata !7), !dbg !8 + br label %body + +body: ; preds = %body, %entry + %0 = phi <2 x i32> [ %add, %body ], [ zeroinitializer, %entry ] + %add = add <2 x i32> %0, %a + %rand1 = tail call i64 @lrand48() #3 + %cmp = icmp eq i64 %rand1, 0 + br i1 %cmp, label %end, label %body + +end: ; preds = %body + %c = bitcast <2 x i32> %add to i64 + %res = add i64 %rand, %c + ret i64 %res +} + +; AARCH64-CHECK: simulateWithDebugIntrinsic +; AARCH64-CHECK: movi d9, #0000000000000000 +; AARCH64-CHECK: bl lrand48 +; AARCH64-CHECK: mov x19, x0 +; AARCH64-CHECK: BB1_1: + + +define i64 @simulateWithDbgDeclare(<2 x i32> %a) local_unnamed_addr { +entry: + %rand = tail call i64 @lrand48() #3 + tail call void @llvm.dbg.declare(metadata i64 %rand, metadata !6, metadata !7), !dbg !8 + br label %body + +body: ; preds = %body, %entry + %0 = phi <2 x i32> [ %add, %body ], [ zeroinitializer, %entry ] + %add = add <2 x i32> %0, %a + %rand1 = tail call i64 @lrand48() #3 + %cmp = icmp eq i64 %rand1, 0 + br i1 %cmp, label %end, label %body + +end: ; preds = %body + %c = bitcast <2 x i32> %add to i64 + %res = add i64 %rand, %c + ret i64 %res +} + +; AARCH64-CHECK: simulateWithDbgDeclare: +; AARCH64-CHECK: movi d9, #0000000000000000 +; AARCH64-CHECK: bl lrand48 +; AARCH64-CHECK: mov x19, x0 +; AARCH64-CHECK: BB2_1: + +declare i64 @lrand48() + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!3, !4} + +!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "test.ll", directory: ".") +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "simulateWithDebugIntrinsic", scope: !2, file: !2, line: 64, isLocal: false, isDefinition: true, scopeLine: 65, unit: !1) +!6 = !DILocalVariable(name: "randv", scope: !5, file: !2, line: 69) +!7 = !DIExpression() +!8 = !DILocation(line: 132, column: 2, scope: !5) diff --git a/test/CodeGen/AArch64/stack-protector-target.ll b/test/CodeGen/AArch64/stack-protector-target.ll index d4d806289bff..787e4a76ec01 100644 --- a/test/CodeGen/AArch64/stack-protector-target.ll +++ b/test/CodeGen/AArch64/stack-protector-target.ll @@ -1,5 +1,7 @@ ; Test target-specific stack cookie location. 
; RUN: llc -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefix=ANDROID-AARCH64 %s +; RUN: llc -mtriple=aarch64-fuchsia < %s -o - | FileCheck --check-prefixes=FUCHSIA-AARCH64-COMMON,FUCHSIA-AARCH64-USER %s +; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel < %s -o - | FileCheck --check-prefixes=FUCHSIA-AARCH64-COMMON,FUCHSIA-AARCH64-KERNEL %s define void @_Z1fv() sspreq { entry: @@ -17,3 +19,11 @@ declare void @_Z7CapturePi(i32*) ; ANDROID-AARCH64: ldr [[C:.*]], {{\[}}[[A]], #40] ; ANDROID-AARCH64: ldr [[D:.*]], [sp, ; ANDROID-AARCH64: cmp [[C]], [[D]] + +; FUCHSIA-AARCH64-USER: mrs [[A:.*]], TPIDR_EL0 +; FUCHSIA-AARCH64-KERNEL: mrs [[A:.*]], TPIDR_EL1 +; FUCHSIA-AARCH64-COMMON: ldur [[B:.*]], {{\[}}[[A]], #-16] +; FUCHSIA-AARCH64-COMMON: str [[B]], [sp, +; FUCHSIA-AARCH64-COMMON: ldur [[C:.*]], {{\[}}[[A]], #-16] +; FUCHSIA-AARCH64-COMMON: ldr [[D:.*]], [sp, +; FUCHSIA-AARCH64-COMMON: cmp [[C]], [[D]] diff --git a/test/CodeGen/AArch64/stack_guard_remat.ll b/test/CodeGen/AArch64/stack_guard_remat.ll index d6bae62e5edc..2b7b3485311a 100644 --- a/test/CodeGen/AArch64/stack_guard_remat.ll +++ b/test/CodeGen/AArch64/stack_guard_remat.ll @@ -15,10 +15,10 @@ ; PIC-LINUX: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}} ; STATIC-LARGE: foo2 -; STATIC-LARGE: movz [[R0:x[0-9]+]], #:abs_g3:__stack_chk_guard -; STATIC-LARGE: movk [[R0]], #:abs_g2_nc:__stack_chk_guard +; STATIC-LARGE: movz [[R0:x[0-9]+]], #:abs_g0_nc:__stack_chk_guard ; STATIC-LARGE: movk [[R0]], #:abs_g1_nc:__stack_chk_guard -; STATIC-LARGE: movk [[R0]], #:abs_g0_nc:__stack_chk_guard +; STATIC-LARGE: movk [[R0]], #:abs_g2_nc:__stack_chk_guard +; STATIC-LARGE: movk [[R0]], #:abs_g3:__stack_chk_guard ; STATIC-LARGE: ldr {{x[0-9]+}}, {{\[}}[[R0]]{{\]}} ; STATIC-SMALL: foo2 @@ -29,20 +29,20 @@ define i32 @test_stack_guard_remat() #0 { entry: %a1 = alloca [256 x i32], align 4 %0 = bitcast [256 x i32]* %a1 to i8* - call void @llvm.lifetime.start(i64 1024, i8* %0) + call void @llvm.lifetime.start.p0i8(i64 1024, i8* %0) %arraydecay = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i64 0, i64 0 call void @foo3(i32* %arraydecay) call void asm sideeffect "foo2", "~{w0},~{w1},~{w2},~{w3},~{w4},~{w5},~{w6},~{w7},~{w8},~{w9},~{w10},~{w11},~{w12},~{w13},~{w14},~{w15},~{w16},~{w17},~{w18},~{w19},~{w20},~{w21},~{w22},~{w23},~{w24},~{w25},~{w26},~{w27},~{w28},~{w29},~{w30}"() - call void @llvm.lifetime.end(i64 1024, i8* %0) + call void @llvm.lifetime.end.p0i8(i64 1024, i8* %0) ret i32 0 } ; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare void @foo3(i32*) ; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll b/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll deleted file mode 100644 index c2997c50f4d4..000000000000 --- a/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc -O3 -o - -verify-machineinstrs %s | FileCheck %s -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-gnu" - -%struct.s1 = type { %struct.s3*, %struct.s1* } 
-%struct.s2 = type opaque -%struct.s3 = type { i32 } - -; Function Attrs: nounwind -define internal fastcc i32 @repeated_dup_worklist(%struct.s1** %pp1, %struct.s2* %p2, i32 %state, i1 %i1_1, i32 %i32_1) unnamed_addr #0 { -entry: - br label %while.cond.outer - -; The loop gets laid out: -; %while.cond.outer -; %(null) -; %(null) -; %dup2 -; and then %dup1 gets chosen as the next block. -; when dup2 is duplicated into dup1, %worklist could erroneously be placed on -; the worklist, because all of its current predecessors are now scheduled. -; However, after dup2 is tail-duplicated, %worklist can't be on the worklist -; because it now has unscheduled predecessors.q -; CHECK-LABEL: repeated_dup_worklist -; CHECK: // %entry -; CHECK: // %while.cond.outer -; first %(null) block -; CHECK: // in Loop: -; CHECK: ldr -; CHECK-NEXT: tbnz -; second %(null) block -; CHECK: // in Loop: -; CHECK: // %dup2 -; CHECK: // %worklist -; CHECK: // %if.then96.i -while.cond.outer: ; preds = %dup1, %entry - %progress.0.ph = phi i32 [ 0, %entry ], [ %progress.1, %dup1 ] - %inc77 = add nsw i32 %progress.0.ph, 1 - %cmp = icmp slt i32 %progress.0.ph, %i32_1 - br i1 %cmp, label %dup2, label %dup1 - -dup2: ; preds = %if.then96.i, %worklist, %while.cond.outer - %progress.1.ph = phi i32 [ 0, %while.cond.outer ], [ %progress.1, %if.then96.i ], [ %progress.1, %worklist ] - %.pr = load %struct.s1*, %struct.s1** %pp1, align 8 - br label %dup1 - -dup1: ; preds = %dup2, %while.cond.outer - %0 = phi %struct.s1* [ %.pr, %dup2 ], [ undef, %while.cond.outer ] - %progress.1 = phi i32 [ %progress.1.ph, %dup2 ], [ %inc77, %while.cond.outer ] - br i1 %i1_1, label %while.cond.outer, label %worklist - -worklist: ; preds = %dup1 - %snode94 = getelementptr inbounds %struct.s1, %struct.s1* %0, i64 0, i32 0 - %1 = load %struct.s3*, %struct.s3** %snode94, align 8 - %2 = getelementptr inbounds %struct.s3, %struct.s3* %1, i32 0, i32 0 - %3 = load i32, i32* %2, align 4 - %tobool95.i = icmp eq i32 %3, 0 - br i1 %tobool95.i, label %if.then96.i, label %dup2 - -if.then96.i: ; preds = %worklist - call fastcc void @free_s3(%struct.s2* %p2, %struct.s3* %1) #1 - br label %dup2 -} - -; Function Attrs: nounwind -declare fastcc void @free_s3(%struct.s2*, %struct.s3*) unnamed_addr #0 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } diff --git a/test/CodeGen/AArch64/tailcall-string-rvo.ll b/test/CodeGen/AArch64/tailcall-string-rvo.ll new file mode 100644 index 000000000000..bdc09235afd9 --- /dev/null +++ b/test/CodeGen/AArch64/tailcall-string-rvo.ll @@ -0,0 +1,47 @@ +; RUN: llc -relocation-model=static -verify-machineinstrs -O2 < %s | FileCheck %s + +; The call to function TestBar should be a tail call, when in C++ the string +; `ret` is RVO returned. 
+; string TestFoo() { +; string ret = undef; +; TestBar(&ret); // tail call optimized +; return ret; +; } + +target triple = "aarch64-linux-gnu" + +%class.basic_string.11.42.73 = type { %"class.__gnu_cxx::__versa_string.10.41.72" } +%"class.__gnu_cxx::__versa_string.10.41.72" = type { %"class.__gnu_cxx::__sso_string_base.9.40.71" } +%"class.__gnu_cxx::__sso_string_base.9.40.71" = type { %"struct.__gnu_cxx::__vstring_utility<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.7.38.69", i64, %union.anon.8.39.70 } +%"struct.__gnu_cxx::__vstring_utility<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.7.38.69" = type { i8* } +%union.anon.8.39.70 = type { i64, [8 x i8] } + +declare void @TestBaz(%class.basic_string.11.42.73* noalias sret %arg) + +define void @TestBar(%class.basic_string.11.42.73* noalias sret %arg) { +bb: + call void @TestBaz(%class.basic_string.11.42.73* noalias sret %arg) + ret void +} + +define void @TestFoo(%class.basic_string.11.42.73* noalias sret %arg) { +; CHECK-LABEL: TestFoo: +; CHECK: b TestBar +bb: + %tmp = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 2 + %tmp1 = bitcast %class.basic_string.11.42.73* %arg to %union.anon.8.39.70** + store %union.anon.8.39.70* %tmp, %union.anon.8.39.70** %tmp1, align 8 + %tmp2 = bitcast %union.anon.8.39.70* %tmp to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* nonnull undef, i64 13, i32 1, i1 false) + %tmp3 = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 1 + store i64 13, i64* %tmp3, align 8 + %tmp4 = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 2, i32 1, i64 5 + store i8 0, i8* %tmp4, align 1 + tail call void @TestBar(%class.basic_string.11.42.73* noalias sret %arg) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0 + +attributes #0 = { argmemonly nounwind } diff --git a/test/CodeGen/AArch64/tbz-tbnz.ll b/test/CodeGen/AArch64/tbz-tbnz.ll index 0dd265c18ec7..7ef78ca52a24 100644 --- a/test/CodeGen/AArch64/tbz-tbnz.ll +++ b/test/CodeGen/AArch64/tbz-tbnz.ll @@ -10,7 +10,7 @@ entry: br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -28,7 +28,7 @@ entry: br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -118,7 +118,7 @@ entry: br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -178,7 +178,7 @@ define void @test9(i64 %val1) { br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -194,7 +194,7 @@ define void @test10(i64 %val1) { br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -209,7 +209,7 @@ define void @test11(i64 %val1, i64* %ptr) { ; CHECK: ldr [[CMP:x[0-9]+]], [x1] ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 %val = load i64, i64* %ptr %tst = icmp slt i64 %val, 0 @@ -229,7 +229,7 @@ define void @test12(i64 %val1) { br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@
-247,7 +247,7 @@ define void @test13(i64 %val1, i64 %val2) { ; CHECK: orr [[CMP:x[0-9]+]], x0, x1 ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() diff --git a/test/CodeGen/AArch64/thread-pointer.ll b/test/CodeGen/AArch64/thread-pointer.ll new file mode 100644 index 000000000000..91585791a58e --- /dev/null +++ b/test/CodeGen/AArch64/thread-pointer.ll @@ -0,0 +1,60 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s + +@x = thread_local local_unnamed_addr global i32 0, align 4 +@y = thread_local local_unnamed_addr global i32 0, align 4 + +; Machine LICM should hoist the mrs into the loop preheader. +; CHECK-LABEL: @test1 +; CHECK: BB#1: +; CHECK: mrs x[[BASE:[0-9]+]], TPIDR_EL0 +; CHECK: add x[[REG1:[0-9]+]], x[[BASE]], :tprel_hi12:x +; CHECK: add x[[REG2:[0-9]+]], x[[REG1]], :tprel_lo12_nc:x +; +; CHECK: .LBB0_2: +; CHECK: ldr w0, [x[[REG2]]] +; CHECK: bl bar +; CHECK: sub w[[REG3:[0-9]+]], w{{[0-9]+}}, #1 +; CHECK: cbnz w[[REG3]], .LBB0_2 + +define void @test1(i32 %n) local_unnamed_addr { +entry: + %cmp3 = icmp sgt i32 %n, 0 + br i1 %cmp3, label %bb1, label %bb2 + +bb1: + br label %for.body + +for.body: + %i.04 = phi i32 [ %inc, %for.body ], [ 0, %bb1 ] + %0 = load i32, i32* @x, align 4 + tail call void @bar(i32 %0) #2 + %inc = add nuw nsw i32 %i.04, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %bb2, label %for.body + +bb2: + ret void +} + +; Machine CSE should combine the the mrs between the load of %x and %y. +; CHECK-LABEL: @test2 +; CHECK: mrs x{{[0-9]+}}, TPIDR_EL0 +; CHECK-NOT: mrs x{{[0-9]+}}, TPIDR_EL0 +; CHECK: ret +define void @test2(i32 %c) local_unnamed_addr #0 { +entry: + %0 = load i32, i32* @x, align 4 + tail call void @bar(i32 %0) #2 + %cmp = icmp eq i32 %c, 0 + br i1 %cmp, label %if.end, label %if.then + +if.then: + %1 = load i32, i32* @y, align 4 + tail call void @bar(i32 %1) #2 + br label %if.end + +if.end: + ret void +} + +declare void @bar(i32) local_unnamed_addr diff --git a/test/CodeGen/AArch64/vector_merge_dep_check.ll b/test/CodeGen/AArch64/vector_merge_dep_check.ll index 9220947e8362..e4e64ef8c8db 100644 --- a/test/CodeGen/AArch64/vector_merge_dep_check.ll +++ b/test/CodeGen/AArch64/vector_merge_dep_check.ll @@ -1,5 +1,4 @@ -; RUN: llc --combiner-alias-analysis=false < %s | FileCheck %s -; RUN: llc --combiner-alias-analysis=true < %s | FileCheck %s +; RUN: llc < %s | FileCheck %s ; This test checks that we do not merge stores together which have ; dependencies through their non-chain operands (e.g. 
one store is the diff --git a/test/CodeGen/AArch64/xray-tail-call-sled.ll b/test/CodeGen/AArch64/xray-tail-call-sled.ll new file mode 100644 index 000000000000..6ada3ce8d551 --- /dev/null +++ b/test/CodeGen/AArch64/xray-tail-call-sled.ll @@ -0,0 +1,69 @@ +; RUN: llc -filetype=asm -o - -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +define i32 @callee() nounwind noinline uwtable "function-instrument"="xray-always" { +; CHECK: .p2align 2 +; CHECK-LABEL: .Lxray_sled_0: +; CHECK-NEXT: b #32 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-LABEL: .Ltmp0: + ret i32 0 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: .p2align 2 +; CHECK-LABEL: .Lxray_sled_1: +; CHECK-NEXT: b #32 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-LABEL: .Ltmp1: +; CHECK-NEXT: ret +} +; CHECK: .p2align 4 +; CHECK-NEXT: .xword .Lxray_synthetic_0 +; CHECK-NEXT: .section xray_instr_map,{{.*}} +; CHECK-LABEL: Lxray_synthetic_0: +; CHECK: .xword .Lxray_sled_0 +; CHECK: .xword .Lxray_sled_1 + +define i32 @caller() nounwind noinline uwtable "function-instrument"="xray-always" { +; CHECK: .p2align 2 +; CHECK-LABEL: .Lxray_sled_2: +; CHECK-NEXT: b #32 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-LABEL: .Ltmp2: +; CHECK: .p2align 2 +; CHECK-LABEL: .Lxray_sled_3: +; CHECK-NEXT: b #32 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-LABEL: .Ltmp3: + %retval = tail call i32 @callee() +; CHECK: b callee + ret i32 %retval +} +; CHECK: .p2align 4 +; CHECK-NEXT: .xword .Lxray_synthetic_1 +; CHECK-NEXT: .section xray_instr_map,{{.*}} +; CHECK-LABEL: Lxray_synthetic_1: +; CHECK: .xword .Lxray_sled_2 +; CHECK: .xword .Lxray_sled_3 diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll index edad18e244d0..ca661cf9a712 100644 --- a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll +++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -13,7 +13,7 @@ ; FUNC-LABEL: {{^}}local_address_load: ; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]] ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] -define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { entry: %0 = load i32, i32 addrspace(3)* %in store i32 %0, i32 addrspace(1)* %out @@ -24,7 +24,7 @@ entry: ; SI: s_add_i32 [[SPTR:s[0-9]]] ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; SI: ds_read_b32 [[VPTR]] -define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) { +define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) { entry: %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset %1 = load i32, i32 addrspace(3)* %0 @@ -35,7 +35,7 @@ entry: ; FUNC-LABEL: {{^}}local_address_gep_const_offset: ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4 -define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { entry: %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1 %1 = load i32, i32 addrspace(3)* %0 
@@ -48,7 +48,7 @@ entry: ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; SI: ds_read_b32 [[VPTR]] -define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { entry: %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385 %1 = load i32, i32 addrspace(3)* %0 @@ -60,7 +60,7 @@ entry: ; SI: v_cmp_ne_u32 ; SI-NOT: v_cmp_ne_u32 ; SI: v_cndmask_b32 -define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { +define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { %cmp = icmp ne i32 addrspace(3)* %lds, null %x = select i1 %cmp, i32 123, i32 456 store i32 %x, i32 addrspace(1)* %out @@ -71,7 +71,7 @@ define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) ; SI: s_mul_i32 ; SI-NEXT: s_add_i32 ; SI: ds_read_b32 -define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) { +define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) { %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0 %val = load float, float addrspace(3)* %ptr store float %val, float addrspace(1)* %out @@ -83,7 +83,7 @@ define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* % ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] -define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { +define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { %val = load float, float addrspace(3)* @g_lds store float %val, float addrspace(1)* %out ret void @@ -95,14 +95,14 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti ; FUNC-LABEL: {{^}}global_ptr: ; SI: ds_write_b32 -define void @global_ptr() nounwind { +define amdgpu_kernel void @global_ptr() nounwind { store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr ret void } ; FUNC-LABEL: {{^}}local_address_store: ; SI: ds_write_b32 -define void @local_address_store(i32 addrspace(3)* %out, i32 %val) { +define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) { store i32 %val, i32 addrspace(3)* %out ret void } @@ -111,7 +111,7 @@ define void @local_address_store(i32 addrspace(3)* %out, i32 %val) { ; SI: s_add_i32 [[SADDR:s[0-9]+]], ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]] ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}} -define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) { +define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) { %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset store i32 %val, i32 addrspace(3)* %gep, align 4 ret void @@ -121,7 +121,7 @@ define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4 -define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) { +define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) { %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 %val, i32 addrspace(3)* %gep, align 4 ret void @@ -132,7 +132,7 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}} -define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) { +define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) { %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385 store i32 %val, i32 addrspace(3)* %gep, align 4 ret void diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir new file mode 100644 index 000000000000..56a9e7022db9 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir @@ -0,0 +1,28 @@ +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN + +# REQUIRES: global-isel + +--- | + define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void } +... +--- + +name: global_addrspace +legalized: true +regBankSelected: true + +# GCN: global_addrspace +# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 +# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0 + +body: | + bb.0: + liveins: %vgpr0_vgpr1 + + %0:vgpr(p1) = COPY %vgpr0_vgpr1 + %1:vgpr(s32) = G_LOAD %0 :: (load 4 from %ir.global0) + %vgpr0 = COPY %1 + +... +--- diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir new file mode 100644 index 000000000000..ea2ad2ba83a5 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -0,0 +1,142 @@ +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,SI,SICI,SIVI +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,CI,SICI +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,VI,SIVI + +# REQUIRES: global-isel + +--- | + define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void } +... 
+--- + +name: smrd_imm +legalized: true +regBankSelected: true + +# GCN: body: +# GCN: [[PTR:%[0-9]+]] = COPY %sgpr0_sgpr1 + +# Immediate offset: +# SICI: S_LOAD_DWORD_IMM [[PTR]], 1, 0 +# VI: S_LOAD_DWORD_IMM [[PTR]], 4, 0 + +# Max immediate offset for SI +# SICI: S_LOAD_DWORD_IMM [[PTR]], 255, 0 +# VI: S_LOAD_DWORD_IMM [[PTR]], 1020, 0 + +# Immediate overflow for SI +# SI: [[K1024:%[0-9]+]] = S_MOV_B32 1024 +# SI: S_LOAD_DWORD_SGPR [[PTR]], [[K1024]], 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 256, 0 +# VI: S_LOAD_DWORD_IMM [[PTR]], 1024, 0 + +# Max immediate offset for VI +# SI: [[K1048572:%[0-9]+]] = S_MOV_B32 1048572 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262143 +# VI: S_LOAD_DWORD_IMM [[PTR]], 1048572 + +# +# Immediate overflow for VI +# SIVI: [[K1048576:%[0-9]+]] = S_MOV_B32 1048576 +# SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K1048576]], 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262144, 0 + +# Max immediate for CI +# SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 4294967292 +# SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 3 +# SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# SIVI-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0 +# SIVI-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0 +# SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] +# SIVI-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1 +# SIVI-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1 +# SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] +# SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0 + +# Immediate overflow for CI +# GCN: [[K_LO:%[0-9]+]] = S_MOV_B32 0 +# GCN: [[K_HI:%[0-9]+]] = S_MOV_B32 4 +# GCN: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# GCN-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0 +# GCN-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0 +# GCN: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] +# GCN-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1 +# GCN-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1 +# GCN: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] +# GCN: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# GCN: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0 + +# Max 32-bit byte offset +# SIVI: [[K4294967292:%[0-9]+]] = S_MOV_B32 4294967292 +# SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K4294967292]], 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741823, 0 + +# Overflow 32-bit byte offset +# SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 0 +# SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 1 +# SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# SIVI-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0 +# SIVI-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0 +# SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] +# SIVI-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1 +# SIVI-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1 +# SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] +# SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741824, 0 + +body: | + bb.0: + liveins: %sgpr0_sgpr1 + + %0:sgpr(p2) = COPY %sgpr0_sgpr1 + + %1:sgpr(s64) = G_CONSTANT i64 4 + %2:sgpr(p2) = G_GEP %0, %1 + %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0) + %sgpr0 = COPY %3 + + %4:sgpr(s64) = G_CONSTANT i64 1020 + %5:sgpr(p2) = G_GEP %0, %4 + %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0) + %sgpr0 = COPY %6 + + %7:sgpr(s64) = G_CONSTANT i64 1024 + %8:sgpr(p2) = G_GEP %0, %7 + %9:sgpr(s32) = G_LOAD %8 :: (load 4 
from %ir.const0) + %sgpr0 = COPY %9 + + %10:sgpr(s64) = G_CONSTANT i64 1048572 + %11:sgpr(p2) = G_GEP %0, %10 + %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0) + %sgpr0 = COPY %12 + + %13:sgpr(s64) = G_CONSTANT i64 1048576 + %14:sgpr(p2) = G_GEP %0, %13 + %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0) + %sgpr0 = COPY %15 + + %16:sgpr(s64) = G_CONSTANT i64 17179869180 + %17:sgpr(p2) = G_GEP %0, %16 + %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0) + %sgpr0 = COPY %18 + + %19:sgpr(s64) = G_CONSTANT i64 17179869184 + %20:sgpr(p2) = G_GEP %0, %19 + %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0) + %sgpr0 = COPY %21 + + %22:sgpr(s64) = G_CONSTANT i64 4294967292 + %23:sgpr(p2) = G_GEP %0, %22 + %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0) + %sgpr0 = COPY %24 + + %25:sgpr(s64) = G_CONSTANT i64 4294967296 + %26:sgpr(p2) = G_GEP %0, %25 + %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0) + %sgpr0 = COPY %27 + +... +--- diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir new file mode 100644 index 000000000000..ea435725bf25 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir @@ -0,0 +1,29 @@ +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN + +# REQUIRES: global-isel + +--- | + define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void } +... +--- + +name: global_addrspace +legalized: true +regBankSelected: true + +# GCN: global_addrspace +# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 +# GCN: [[VAL:%[0-9]+]] = COPY %vgpr2 +# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0 + +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2 + + %0:vgpr(p1) = COPY %vgpr0_vgpr1 + %1:vgpr(s32) = COPY %vgpr2 + G_STORE %1, %0 :: (store 4 into %ir.global0) + +... +--- diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir new file mode 100644 index 000000000000..3496b1ab71fe --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir @@ -0,0 +1,69 @@ +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=regbankselect -global-isel %s -verify-machineinstrs -o - | FileCheck %s + +# REQUIRES: global-isel + +--- | + define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void } + define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) { + %tmp0 = load i32, i32 addrspace(1)* %ptr1 + ret void + } + define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0 + %tmp2 = load i32, i32 addrspace(1)* %tmp1 + ret void + } + declare i32 @llvm.amdgcn.workitem.id.x() #0 + attributes #0 = { nounwind readnone } +... + +--- +name : load_constant +legalized: true + +# CHECK-LABEL: name: load_constant +# CHECK: registers: +# CHECK: - { id: 0, class: sgpr } +# CHECK: - { id: 1, class: sgpr } + +body: | + bb.0: + liveins: %sgpr0_sgpr1 + %0:_(p2) = COPY %sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0) +... 
+ +--- +name: load_global_uniform +legalized: true + +# CHECK-LABEL: name: load_global_uniform +# CHECK: registers: +# CHECK: - { id: 0, class: sgpr } +# CHECK: - { id: 1, class: sgpr } + +body: | + bb.0: + liveins: %sgpr0_sgpr1 + %0:_(p1) = COPY %sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr1) +... + +--- +name: load_global_non_uniform +legalized: true + +# CHECK-LABEL: name: load_global_non_uniform +# CHECK: registers: +# CHECK: - { id: 0, class: sgpr } +# CHECK: - { id: 1, class: vgpr } +# CHECK: - { id: 2, class: vgpr } + + +body: | + bb.0: + liveins: %sgpr0_sgpr1 + %0:_(p1) = COPY %sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.tmp1) +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll b/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll new file mode 100644 index 000000000000..a1bf987e6552 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=GCN %s + +; REQUIRES: global-isel + +; GCN-LABEL: vs_epilog +; GCN: s_endpgm + +define amdgpu_vs void @vs_epilog() { +main_body: + ret void +} diff --git a/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/test/CodeGen/AMDGPU/GlobalISel/smrd.ll new file mode 100644 index 000000000000..8a6b3df9cff8 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -0,0 +1,89 @@ +; FIXME: Need to add support for mubuf stores to enable this on SI. +; XUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s +; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=CI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s + +; REQUIRES: global-isel + +; SMRD load with an immediate offset. +; GCN-LABEL: {{^}}smrd0: +; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 +define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with the largest possible immediate offset. +; GCN-LABEL: {{^}}smrd1: +; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate. 
+; GCN-LABEL: {{^}}smrd2: +; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; GCN: s_endpgm +define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with a 64-bit offset +; GCN-LABEL: {{^}}smrd3: +; FIXME: There are too many copies here because we don't fold immediates +; through REG_SEQUENCE +; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b +; TODO: Add VI checks +; XGCN: s_endpgm +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with the largest possible immediate offset on VI +; GCN-LABEL: {{^}}smrd4: +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate on VI +; GCN-LABEL: {{^}}smrd5: +; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 +; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 +; GCN: s_endpgm +define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/add-debug.ll b/test/CodeGen/AMDGPU/add-debug.ll index 529905dd36a2..b90c20b97482 100644 --- a/test/CodeGen/AMDGPU/add-debug.ll +++ b/test/CodeGen/AMDGPU/add-debug.ll @@ -3,7 +3,7 @@ ; REQUIRES: asserts ; Check that SelectionDAGDumper does not crash on int_SI_if. 
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll index 6c5cdd3877d1..3b274c9d2027 100644 --- a/test/CodeGen/AMDGPU/add.i16.ll +++ b/test/CodeGen/AMDGPU/add.i16.ll @@ -6,7 +6,7 @@ ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -23,7 +23,7 @@ define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -38,7 +38,7 @@ define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* % ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -53,7 +53,7 @@ define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -69,7 +69,7 @@ define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1) ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_dword [[ADD]] -define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -84,12 +84,12 @@ 
define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64: +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -109,7 +109,7 @@ define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] -define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -130,7 +130,7 @@ define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll index a6247c735240..7e4546d2cfb3 100644 --- a/test/CodeGen/AMDGPU/add.ll +++ b/test/CodeGen/AMDGPU/add.ll @@ -8,7 +8,7 @@ ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}} ;SI-NOT: [[REG]] ;SI: buffer_store_dword [[REG]], -define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -24,7 +24,7 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -44,7 +44,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x 
i32> addrspace(1)* %in) { ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -71,7 +71,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; SI: s_add_i32 ; SI: s_add_i32 ; SI: s_add_i32 -define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { +define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { entry: %0 = add <8 x i32> %a, %b store <8 x i32> %0, <8 x i32> addrspace(1)* %out @@ -112,7 +112,7 @@ entry: ; SI: s_add_i32 ; SI: s_add_i32 ; SI: s_add_i32 -define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { +define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { entry: %0 = add <16 x i32> %a, %b store <16 x i32> %0, <16 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ entry: ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: %0 = add i64 %a, %b store i64 %0, i64 addrspace(1)* %out @@ -150,7 +150,7 @@ entry: ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { +define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { entry: %0 = load i64, i64 addrspace(1)* %in %1 = add i64 %a, %0 @@ -169,7 +169,7 @@ entry: ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll new file mode 100644 index 000000000000..e137ef4bc236 --- /dev/null +++ b/test/CodeGen/AMDGPU/add.v2i16.ll @@ -0,0 +1,283 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_add_v2i16: +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_test_add_v2i16: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]] +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]] + +; VI: s_add_i32 +; VI: s_add_i32 +define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 + %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 + %add = add <2 x i16> %a, %b + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_test_add_self_v2i16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL]], [[VAL]] + +; VI: s_add_i32 +; VI: s_add_i32 +define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 + %add = add <2 x i16> %a, %a + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; FIXME: VI should not scalarize arg access. +; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg: +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} + +; VI: v_add_i32 +; VI: v_add_i32_sdwa +define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { + %add = add <2 x i16> %a, %b + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_v2i16_constant: +; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}} + +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}} +define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 123, i16 456> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant: +; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}} + +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}} +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}} +define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 -845, i16 -991> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1: +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}} + +; VI: flat_load_ushort [[LOAD0:v[0-9]+]] +; VI: flat_load_ushort [[LOAD1:v[0-9]+]] +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]] +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]] +; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_e32 +define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 -1, i16 -1> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi: +; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}} + +; VI-NOT: v_add_u16 +; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}} +; VI-NOT: v_add_u16 +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_e32 +define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 32, i16 0> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; The high element gives fp +; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split: +; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0 +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}} + +; VI-NOT: v_add_u16 +; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}} +; VI-NOT: v_add_u16 +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_e32 +define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 0, i16 16256> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i32: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] +; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} + +; VI: flat_load_ushort v[[A_HI:[0-9]+]] +; VI: flat_load_ushort v[[A_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_ushort v[[B_LO:[0-9]+]] + +; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] +; VI-NOT: and +; VI-NOT: shl +; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; VI-NOT: and +; VI-NOT: shl +; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} +define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + %ext = zext <2 x i16> %add to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] +; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9: buffer_store_dwordx4 + +; VI: flat_load_ushort v[[A_LO:[0-9]+]] +; VI: flat_load_ushort v[[A_HI:[0-9]+]] +; VI: flat_load_ushort v[[B_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] + +; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; VI: v_add_u16_e32 +; VI: v_add_u16_e32 + +; VI: buffer_store_dwordx4 +define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + %ext = zext <2 x i16> %add to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i32: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16 +; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} + +; VI: v_add_u16_e32 +; VI: v_add_u16_e32 +; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; VI: buffer_store_dwordx2 +define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + %ext = sext <2 x i16> %add to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i64: +; GCN: flat_load_dword +; GCN: flat_load_dword + +; GFX9: v_pk_add_u16 +; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} + +; VI: v_add_u16_sdwa +; VI: v_add_u16_e32 + +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + %ext = sext <2 x i16> %add to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/add_i128.ll b/test/CodeGen/AMDGPU/add_i128.ll index c80157ca9c58..00a125c2e44f 100644 --- a/test/CodeGen/AMDGPU/add_i128.ll +++ b/test/CodeGen/AMDGPU/add_i128.ll @@ -6,7 +6,7 @@ ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]], -define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid @@ -23,7 +23,7 @@ define void @test_i128_vreg(i128 addrspace(1)* noalias %out, 
i128 addrspace(1)* ; GCN: v_addc_u32 ; GCN: v_addc_u32 ; GCN: v_addc_u32 -define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { +define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %foo, %a store i128 %result, i128 addrspace(1)* %out @@ -35,7 +35,7 @@ define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* no ; GCN: v_addc_u32 ; GCN: v_addc_u32 ; GCN: v_addc_u32 -define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { +define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %a, %foo store i128 %result, i128 addrspace(1)* %out @@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspa ; GCN: s_addc_u32 ; GCN: s_addc_u32 ; GCN: s_addc_u32 -define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) { +define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) { %result = add i128 %a, %b store i128 %result, i128 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll index 3d360b7d0b7a..62733d5bfb6c 100644 --- a/test/CodeGen/AMDGPU/add_i64.ll +++ b/test/CodeGen/AMDGPU/add_i64.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone ; SI-LABEL: {{^}}test_i64_vreg: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid @@ -21,7 +21,7 @@ define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa ; SI-LABEL: {{^}}sgpr_operand: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { +define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %foo, %a store i64 %result, i64 addrspace(1)* %out @@ -34,7 +34,7 @@ define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noal ; SI-LABEL: {{^}}sgpr_operand_reversed: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { +define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %a, %foo store i64 %result, i64 addrspace(1)* %out @@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace ; SI: s_addc_u32 ; SI: s_add_u32 ; SI: s_addc_u32 -define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { +define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { %result = add <2 x 
i64> %a, %b store <2 x i64> %result, <2 x i64> addrspace(1)* %out ret void @@ -58,7 +58,7 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, ; SI: v_addc_u32 ; SI: v_add_i32 ; SI: v_addc_u32 -define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid @@ -76,7 +76,7 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add ; SI-NOT: addc ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %add = add i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 diff --git a/test/CodeGen/AMDGPU/addrspacecast-captured.ll b/test/CodeGen/AMDGPU/addrspacecast-captured.ll new file mode 100644 index 000000000000..138bc36b9e1b --- /dev/null +++ b/test/CodeGen/AMDGPU/addrspacecast-captured.ll @@ -0,0 +1,47 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s + +; Nothing should be done if the addrspacecast is captured. + +declare void @consume_ptr2int(i32) #0 + +; CHECK-LABEL: @addrspacecast_captured( +; CHECK: %data = alloca i32, align 4 +; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* +; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 +; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out +define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 { +entry: + %data = alloca i32, align 4 + %cast = addrspacecast i32* %data to i32 addrspace(4)* + %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 + store i32 %ptr2int, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @addrspacecast_captured_store( +; CHECK: %data = alloca i32, align 4 +; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* +; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out +define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 { +entry: + %data = alloca i32, align 4 + %cast = addrspacecast i32* %data to i32 addrspace(4)* + store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @addrspacecast_captured_call( +; CHECK: %data = alloca i32, align 4 +; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* +; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 +; CHECK: call void @consume_ptr2int(i32 %ptr2int) +define amdgpu_kernel void @addrspacecast_captured_call() #0 { +entry: + %data = alloca i32, align 4 + %cast = addrspacecast i32* %data to i32 addrspace(4)* + %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 + call void @consume_ptr2int(i32 %ptr2int) + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index 67a193999204..8cabc7dae133 100644 --- a/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -9,57 +9,57 @@ 
declare void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* nocapture, i32 addrs @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 ; HSA: @store_cast_0_flat_to_group_addrspacecast() #1 -define void @store_cast_0_flat_to_group_addrspacecast() #1 { +define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 { store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) ret void } ; HSA: @store_cast_0_group_to_flat_addrspacecast() #2 -define void @store_cast_0_group_to_flat_addrspacecast() #1 { +define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*) ret void } -; HSA: define void @store_constant_cast_group_gv_to_flat() #2 -define void @store_constant_cast_group_gv_to_flat() #1 { +; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2 +define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*) ret void } ; HSA: @store_constant_cast_group_gv_gep_to_flat() #2 -define void @store_constant_cast_group_gv_gep_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 { store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) ret void } ; HSA: @store_constant_cast_global_gv_to_flat() #1 -define void @store_constant_cast_global_gv_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*) ret void } ; HSA: @store_constant_cast_global_gv_gep_to_flat() #1 -define void @store_constant_cast_global_gv_gep_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 { store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) ret void } ; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) store i32 %val, i32 addrspace(1)* %out ret void } ; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst store i32 %val, i32 addrspace(1)* %out ret void } ; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* 
addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out @@ -67,28 +67,28 @@ define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) } ; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false) ret void } ; Can't just search the pointer value ; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2 -define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 { +define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 { store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out ret void } ; Can't just search pointer types ; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2 -define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 { +define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 { store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out ret void } ; Cast group to flat, do GEP, cast back to group ; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2 -define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { +define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) ret void } diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll index 0a2130c96add..6ec93c72ec52 100644 --- a/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,14 +1,23 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=CI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=GFX9 %s ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_dispatch_ptr = 0 -; HSA: enable_sgpr_queue_ptr = 1 +; CI: enable_sgpr_queue_ptr = 1 +; GFX9: enable_sgpr_queue_ptr = 0 -; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} -; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} +; CI-DAG: 
s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} +; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] + +; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} +; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] + +; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base -; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 @@ -17,7 +26,13 @@ ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] -define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { + +; At most 2 digits. Make sure src_shared_base is not counted as a high +; number SGPR. + +; CI: NumSgprs: {{[0-9][0-9]+}} +; GFX9: NumSgprs: {{[0-9]+}} +define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -26,21 +41,32 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_dispatch_ptr = 0 -; HSA: enable_sgpr_queue_ptr = 1 +; CI: enable_sgpr_queue_ptr = 1 +; GFX9: enable_sgpr_queue_ptr = 0 -; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} -; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} +; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} +; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] + +; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} +; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]] + +; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base -; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 ; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]] ; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] -define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { + +; CI: NumSgprs: {{[0-9][0-9]+}} +; GFX9: NumSgprs: {{[0-9]+}} +define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { %stof = addrspacecast i32* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -55,7 +81,7 @@ define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] -define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -67,7 +93,7 @@ define void 
@use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} -define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %stof ret void @@ -84,7 +110,7 @@ define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; HSA: ds_write_b32 [[CASTPTR]], v[[K]] -define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos ret void @@ -98,10 +124,10 @@ define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}} ; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] -; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] +; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32* store volatile i32 0, i32* %ftos ret void @@ -115,7 +141,7 @@ define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] -define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos ret void @@ -126,21 +152,27 @@ define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0 -define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)* load volatile i32, i32 addrspace(2)* %ftos ret void } ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast: -; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 -; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] +; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 +; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] +; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]] + +; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base + ; 
HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_0_group_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)* - store i32 7, i32 addrspace(4)* %cast + store volatile i32 7, i32 addrspace(4)* %cast ret void } @@ -148,9 +180,9 @@ define void @cast_0_group_to_flat_addrspacecast() #0 { ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: ds_write_b32 [[PTR]], [[K]] -define void @cast_0_flat_to_group_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)* - store i32 7, i32 addrspace(3)* %cast + store volatile i32 7, i32 addrspace(3)* %cast ret void } @@ -159,9 +191,9 @@ define void @cast_0_flat_to_group_addrspacecast() #0 { ; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_neg1_group_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)* - store i32 7, i32 addrspace(4)* %cast + store volatile i32 7, i32 addrspace(4)* %cast ret void } @@ -169,31 +201,34 @@ define void @cast_neg1_group_to_flat_addrspacecast() #0 { ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: ds_write_b32 [[PTR]], [[K]] -define void @cast_neg1_flat_to_group_addrspacecast() #0 { +define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)* - store i32 7, i32 addrspace(3)* %cast + store volatile i32 7, i32 addrspace(3)* %cast ret void } +; FIXME: Shouldn't need to enable queue ptr ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast: -; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 -; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] +; CI: enable_sgpr_queue_ptr = 1 +; GFX9: enable_sgpr_queue_ptr = 0 + ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} +; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_0_private_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { %cast = addrspacecast i32* null to i32 addrspace(4)* - store i32 7, i32 addrspace(4)* %cast + store volatile i32 7, i32 addrspace(4)* %cast ret void } ; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast: -; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} +; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -define void @cast_0_flat_to_private_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)* - store i32 7, i32* %cast + store volatile i32 7, i32* %cast ret void } @@ -203,7 +238,7 @@ define void @cast_0_flat_to_private_addrspacecast() #0 { ; HSA-LABEL: {{^}}branch_use_flat_i32: ; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} ; HSA: s_endpgm -define void 
@branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { entry: %cmp = icmp ne i32 %c, 0 br i1 %cmp, label %local, label %global @@ -218,7 +253,7 @@ global: end: %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store i32 %x, i32 addrspace(4)* %fptr, align 4 + store volatile i32 %x, i32 addrspace(4)* %fptr, align 4 ; %val = load i32, i32 addrspace(4)* %fptr, align 4 ; store i32 %val, i32 addrspace(1)* %out, align 4 ret void @@ -226,22 +261,26 @@ end: ; Check for prologue initializing special SGPRs pointing to scratch. ; HSA-LABEL: {{^}}store_flat_scratch: -; HSA-DAG: s_mov_b32 flat_scratch_lo, s9 -; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11 -; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8 +; CI-DAG: s_mov_b32 flat_scratch_lo, s9 +; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11 +; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8 + +; GFX9: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9: s_addc_u32 flat_scratch_hi, s7, 0 + ; HSA: flat_store_dword ; HSA: s_barrier ; HSA: flat_load_dword -define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { +define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { %alloca = alloca i32, i32 9, align 4 %x = call i32 @llvm.amdgcn.workitem.id.x() #2 %pptr = getelementptr i32, i32* %alloca, i32 %x %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr + store volatile i32 %x, i32 addrspace(4)* %fptr ; Dummy call call void @llvm.amdgcn.s.barrier() #1 - %reload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %reload, i32 addrspace(1)* %out, align 4 + %reload = load volatile i32, i32 addrspace(4)* %fptr, align 4 + store volatile i32 %reload, i32 addrspace(1)* %out, align 4 ret void } diff --git a/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/test/CodeGen/AMDGPU/amdgcn.bitcast.ll index 87ef5978ebfc..ef742f56faec 100644 --- a/test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ b/test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -3,24 +3,20 @@ ; This test just checks that the compiler doesn't crash. 
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - ; FUNC-LABEL: {{^}}v32i8_to_v8i32: -; SI: s_endpgm -define amdgpu_ps void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { entry: %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 %2 = bitcast <32 x i8> %1 to <8 x i32> %3 = extractelement <8 x i32> %2, i32 1 %4 = icmp ne i32 %3, 0 %5 = select i1 %4, float 0.0, float 1.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) - ret void + ret float %5 } ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: ; SI: s_endpgm -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 @@ -28,28 +24,50 @@ entry: ret void } -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 - %bc = bitcast float %load to <2 x i16> - store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 + %fadd32 = fadd float %load, 1.0 + %bc = bitcast float %fadd32 to <2 x i16> + %add.bitcast = add <2 x i16> %bc, + store <2 x i16> %add.bitcast, <2 x i16> addrspace(1)* %out ret void } -define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 - %bc = bitcast <2 x i16> %load to float - store float %bc, float addrspace(1)* %out, align 4 + %add.v2i16 = add <2 x i16> %load, + %bc = bitcast <2 x i16> %add.v2i16 to float + %fadd.bitcast = fadd float %bc, 1.0 + store float %fadd.bitcast, float addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %load = load float, float addrspace(1)* %in, align 4 + %fadd32 = fadd float %load, 1.0 + %bc = bitcast float %fadd32 to <2 x half> + %add.bitcast = fadd <2 x half> %bc, + store <2 x half> %add.bitcast, <2 x half> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind { + %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4 + %add.v2f16 = fadd <2 x half> %load, + %bc = bitcast <2 x half> %add.v2f16 to float + %fadd.bitcast = fadd float %bc, 1.0 + store float %fadd.bitcast, float addrspace(1)* %out ret void } -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %bc = bitcast <4 x i8> %load to i32 store i32 %bc, i32 addrspace(1)* %out, align 4 ret void } -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -58,17 +76,18 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* 
%out, i32 addrspace(1)* %in) nou ; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64: ; SI: s_endpgm -define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %add = add <2 x i32> %val, %bc = bitcast <2 x i32> %add to double - store double %bc, double addrspace(1)* %out, align 8 + %fadd.bc = fadd double %bc, 1.0 + store double %fadd.bc, double addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: ; SI: s_endpgm -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { %val = load double, double addrspace(1)* %in, align 8 %add = fadd double %val, 4.0 %bc = bitcast double %add to <2 x i32> @@ -77,7 +96,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace } ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64: -define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { +define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -93,7 +112,7 @@ end: } ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64: -define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { +define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end diff --git a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll index a6d055891d4b..79450b97c218 100644 --- a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll +++ b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll @@ -15,7 +15,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; GCN-NOT: v0 ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]] -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: %0 = alloca [2 x i32] %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll deleted file mode 100644 index 8d8885852afe..000000000000 --- a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -; GCN-LABEL: {{^}}main: -; GCN: s_mov_b32 m0, s0 -; VI-NEXT: s_nop 0 -; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) -; GCN-NEXT: s_endpgm - -define amdgpu_gs void @main(i32 inreg %a) #0 { - call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a) - ret void -} - -; GCN-LABEL: {{^}}main_halt: -; GCN: s_mov_b32 m0, s0 -; VI-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT) -; GCN-NEXT: s_endpgm - -define void @main_halt(i32 inreg %a) #0 { - call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a) - ret void -} - -; GCN-LABEL: {{^}}legacy: -; GCN: s_mov_b32 m0, s0 -; VI-NEXT: s_nop 0 -; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) -; GCN-NEXT: s_endpgm - -define amdgpu_gs void 
@legacy(i32 inreg %a) #0 { - call void @llvm.SI.sendmsg(i32 3, i32 %a) - ret void -} - -declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 -declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0 -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll deleted file mode 100644 index 31f9cfca6def..000000000000 --- a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll +++ /dev/null @@ -1,161 +0,0 @@ -;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}test_interrupt: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT) -define void @test_interrupt() { -body: - call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0); - ret void -} - -; CHECK-LABEL: {{^}}test_gs_emit: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) -define void @test_gs_emit() { -body: - call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0); - ret void -} - -; CHECK-LABEL: {{^}}test_gs_cut: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) -define void @test_gs_cut() { -body: - call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0); - ret void -} - -; CHECK-LABEL: {{^}}test_gs_emit_cut: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -define void @test_gs_emit_cut() { -body: - call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_gs_done: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) -define void @test_gs_done() { -body: - call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) - ret void -} - - -; CHECK-LABEL: {{^}}test_interrupt_halt: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsghalt sendmsg(MSG_INTERRUPT) -define void @test_interrupt_halt() { -body: - call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_gs_emit_halt: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0) -define void @test_gs_emit_halt() { -body: - call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_gs_cut_halt: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1) -define void @test_gs_cut_halt() { -body: - call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_gs_emit_cut_halt: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -define void @test_gs_emit_cut_halt() { -body: - call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_gs_done_halt: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP) -define void @test_gs_done_halt() { -body: - call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0) - ret void -} - -; Legacy -; CHECK-LABEL: {{^}}test_legacy_interrupt: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT) -define void @test_legacy_interrupt() { -body: - call void @llvm.SI.sendmsg(i32 1, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_legacy_gs_emit: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: 
s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) -define void @test_legacy_gs_emit() { -body: - call void @llvm.SI.sendmsg(i32 34, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_legacy_gs_cut: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) -define void @test_legacy_gs_cut() { -body: - call void @llvm.SI.sendmsg(i32 274, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_legacy_gs_emit_cut: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -define void @test_legacy_gs_emit_cut() { -body: - call void @llvm.SI.sendmsg(i32 562, i32 0) - ret void -} - -; CHECK-LABEL: {{^}}test_legacy_gs_done: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) -define void @test_legacy_gs_done() { -body: - call void @llvm.SI.sendmsg(i32 3, i32 0) - ret void -} - -; Function Attrs: nounwind -declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 -declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0 -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll new file mode 100644 index 000000000000..e68ed9cac93f --- /dev/null +++ b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll @@ -0,0 +1,9 @@ +; RUN: opt -mtriple=amdgcn-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s + +; CHECK: NoAlias: i8 addrspace(1)* %p1, i8* %p + +define void @test(i8* %p, i8 addrspace(1)* %p1) { + ret void +} + diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll index d78c75165be2..0e5605961e10 100644 --- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -4,7 +4,7 @@ ; NOOP-LABEL: @noop_fdiv_fpmath( ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0 -define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { +define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { %md.25ulp = fdiv float %a, %b, !fpmath !0 store volatile float %md.25ulp, float addrspace(1)* %out ret void @@ -18,7 +18,7 @@ define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 ; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { +define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -51,7 +51,7 @@ define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 ; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} ; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 -define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { %no.md = fdiv float 1.0, %x store volatile float %no.md, float addrspace(1)* %out @@ -89,7 +89,7 @@ define void 
@rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1 -define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { +define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { %no.md = fdiv <2 x float> %a, %b store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out -define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> , %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -158,7 +158,7 @@ define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 ; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp -define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> , %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -186,7 +186,7 @@ define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp -define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 @@ -206,7 +206,7 @@ define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* % ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll index 13e4192ccd72..95a206e1dd00 100644 --- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll +++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll @@ -6,7 +6,7 @@ ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: 
%[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @add_i3(i3 %a, i3 %b) { @@ -19,7 +19,7 @@ define i3 @add_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = add nsw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @add_nsw_i3(i3 %a, i3 %b) { @@ -32,7 +32,7 @@ define i3 @add_nsw_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = add nuw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @add_nuw_i3(i3 %a, i3 %b) { @@ -58,7 +58,7 @@ define i3 @add_nuw_nsw_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @sub_i3(i3 %a, i3 %b) { @@ -84,7 +84,7 @@ define i3 @sub_nsw_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @sub_nuw_i3(i3 %a, i3 %b) { @@ -110,7 +110,7 @@ define i3 @sub_nuw_nsw_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @mul_i3(i3 %a, i3 %b) { @@ -123,7 +123,7 @@ define i3 @mul_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @mul_nsw_i3(i3 %a, i3 %b) { @@ -136,7 +136,7 @@ define i3 @mul_nsw_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @mul_nuw_i3(i3 %a, i3 %b) { @@ -188,7 +188,7 @@ define i3 @srem_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @shl_i3(i3 %a, i3 
%b) { @@ -201,7 +201,7 @@ define i3 @shl_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @shl_nsw_i3(i3 %a, i3 %b) { @@ -214,7 +214,7 @@ define i3 @shl_nsw_i3(i3 %a, i3 %b) { ; SI-NEXT: ret i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 ; VI-NEXT: ret i3 %[[R_3]] define i3 @shl_nuw_i3(i3 %a, i3 %b) { @@ -525,7 +525,7 @@ define i3 @bitreverse_i3(i3 %a) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @add_i16(i16 %a, i16 %b) { @@ -559,7 +559,7 @@ define i16 @constant_add_nuw_i16() { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = add nsw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @add_nsw_i16(i16 %a, i16 %b) { @@ -572,7 +572,7 @@ define i16 @add_nsw_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = add nuw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @add_nuw_i16(i16 %a, i16 %b) { @@ -598,7 +598,7 @@ define i16 @add_nuw_nsw_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @sub_i16(i16 %a, i16 %b) { @@ -624,7 +624,7 @@ define i16 @sub_nsw_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @sub_nuw_i16(i16 %a, i16 %b) { @@ -650,7 +650,7 @@ define i16 @sub_nuw_nsw_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @mul_i16(i16 %a, i16 %b) { @@ -663,7 +663,7 @@ define i16 @mul_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @mul_nsw_i16(i16 %a, i16 %b) { @@ -676,7 +676,7 @@ define i16 @mul_nsw_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @mul_nuw_i16(i16 %a, i16 %b) { @@ -728,7 +728,7 @@ define i16 @srem_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @shl_i16(i16 %a, i16 %b) { @@ -741,7 +741,7 @@ define i16 @shl_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @shl_nsw_i16(i16 %a, i16 %b) { @@ -754,7 +754,7 @@ define i16 @shl_nsw_i16(i16 %a, i16 %b) { ; SI-NEXT: ret i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 -; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw i32 %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 ; VI-NEXT: ret i16 %[[R_16]] define i16 @shl_nuw_i16(i16 %a, i16 %b) { @@ -1072,7 +1072,7 @@ define i16 @bitreverse_i16(i16 %a) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = add <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1085,7 +1085,7 @@ define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = add nsw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1098,7 +1098,7 @@ define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = add nuw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> 
%[[R_15]] define <3 x i15> @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1124,7 +1124,7 @@ define <3 x i15> @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = sub <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @sub_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1150,7 +1150,7 @@ define <3 x i15> @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1176,7 +1176,7 @@ define <3 x i15> @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = mul <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1189,7 +1189,7 @@ define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1202,7 +1202,7 @@ define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1254,7 +1254,7 @@ define <3 x i15> @srem_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = shl <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1267,7 +1267,7 @@ define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw <3 x i32> 
%[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1280,7 +1280,7 @@ define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; SI-NEXT: ret <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> ; VI-NEXT: ret <3 x i15> %[[R_15]] define <3 x i15> @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { @@ -1591,7 +1591,7 @@ define <3 x i15> @bitreverse_3xi15(<3 x i15> %a) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = add <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1604,7 +1604,7 @@ define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = add nsw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1617,7 +1617,7 @@ define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = add nuw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1643,7 +1643,7 @@ define <3 x i16> @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = sub <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @sub_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1669,7 +1669,7 @@ define <3 x i16> @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1695,7 +1695,7 @@ define <3 x i16> @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { ; 
SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = mul <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1708,7 +1708,7 @@ define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1721,7 +1721,7 @@ define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1773,7 +1773,7 @@ define <3 x i16> @srem_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = shl <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1786,7 +1786,7 @@ define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { @@ -1799,7 +1799,7 @@ define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { ; SI-NEXT: ret <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> -; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw <3 x i32> %[[A_32]], %[[B_32]] +; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> ; VI-NEXT: ret <3 x i16> %[[R_16]] define <3 x i16> @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { diff --git a/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll index dd16907b748c..0ba8836b20dc 100644 --- a/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll +++ b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll @@ -13,9 +13,10 @@ define amdgpu_cs float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w ; GCN-LABEL: {{^}}kernel_cc: ; GCN: s_endpgm 
-define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { +define amdgpu_kernel void @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { %vi = bitcast float %v to i32 %x = add i32 %vi, %w %xf = bitcast i32 %x to float - ret float %xf + store float %xf, float addrspace(1)* undef + ret void } diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 1511e1343808..1f4b1eaa209a 100644 --- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -1,9 +1,9 @@ -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 
-verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s @@ -27,8 +27,6 @@ ; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120 ; HSA-PROMOTE: .end_amd_kernel_code_t -; FIXME: These should be merged -; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1 ; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2 ; SI-PROMOTE: ds_write_b32 @@ -58,9 +56,9 @@ ; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16 -; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1 -; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1 -; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1 +; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !2 +; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !2 ; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]] ; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]] @@ -77,10 +75,10 @@ ; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0 ; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !1 +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !1 +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1 +define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -102,7 +100,7 @@ entry: ; OPT-LABEL: @high_alignment( ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}} -define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [8 x i32], align 16 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -127,7 +125,7 @@ entry: ; OPT: alloca [5 x i32] ; SI-NOT: ds_write -define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -162,7 +160,7 @@ entry: ; SI-NOT: v_movrel %struct.point = type { i32, i32 } -define void @multiple_structs(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { entry: %a = alloca 
%struct.point %b = alloca %struct.point @@ -191,7 +189,7 @@ entry: ; R600-NOT: MOVA_INT ; SI-NOT: v_movrel -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: %prv_array_const = alloca [2 x i32] %prv_array = alloca [2 x i32] @@ -227,11 +225,15 @@ for.end: ; R600: MOVA_INT -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x68,0xe0 -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:2 ; encoding: [0x02,0x00,0x68,0xe0 +; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:6 ; encoding: [0x06,0x00,0x68,0xe0 +; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x68,0xe0 ; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort. -; SI-PROMOTE: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} -define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { +; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} + +; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]] +; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16 +; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16 +define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i16] %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0 @@ -249,12 +251,12 @@ entry: ; R600: MOVA_INT -; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: -; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding: +; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: +; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: -; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x60,0xe0 -; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding: [0x01,0x00,0x60,0xe0 -define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { +; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0 +; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 +define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8] %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0 @@ -277,7 +279,7 @@ entry: ; ; A total of 5 bytes should be allocated and used. 
; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1 %1 = alloca [2 x i8], align 1 @@ -301,7 +303,7 @@ entry: ret void } -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i8]] %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 @@ -315,7 +317,7 @@ entry: ret void } -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -328,7 +330,7 @@ entry: ret void } -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i64]] %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 @@ -343,7 +345,7 @@ entry: %struct.pair32 = type { i32, i32 } -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x %struct.pair32]] %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 @@ -356,7 +358,7 @@ entry: ret void } -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x %struct.pair32] %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 @@ -369,7 +371,7 @@ entry: ret void } -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -390,7 +392,7 @@ entry: ; SI-NOT: ds_write ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ; -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32] %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a store i32 5, i32* %tmp0 @@ -406,7 +408,7 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; OPT-LABEL: @pointer_typed_alloca( ; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}} ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4 -define void @pointer_typed_alloca(i32 addrspace(1)* %A) { +define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 @@ -458,7 +460,7 @@ entry: ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel 
void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { %alloca = alloca [2 x <16 x i32>] %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a %tmp5 = load <16 x i32>, <16 x i32>* %tmp0 @@ -502,7 +504,7 @@ define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { %alloca = alloca [2 x <16 x float>] %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a %tmp5 = load <16 x float>, <16 x float>* %tmp0 @@ -518,7 +520,7 @@ define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { %alloca = alloca [16 x <2 x float>] %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a %tmp5 = load <2 x float>, <2 x float>* %tmp0 @@ -529,7 +531,7 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { ; OPT-LABEL: @direct_alloca_read_0xi32( ; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)* ; OPT: load [0 x i32], [0 x i32] addrspace(3)* -define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [0 x i32] store [0 x i32] [], [0 x i32]* %tmp @@ -541,7 +543,7 @@ entry: ; OPT-LABEL: @direct_alloca_read_1xi32( ; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)* ; OPT: load [1 x i32], [1 x i32] addrspace(3)* -define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [1 x i32] store [1 x i32] [i32 0], [1 x i32]* %tmp @@ -553,6 +555,8 @@ entry: attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" } ; HSAOPT: !0 = !{} -; HSAOPT: !1 = !{i32 0, i32 2048} +; HSAOPT: !1 = !{i32 0, i32 257} +; HSAOPT: !2 = !{i32 0, i32 256} -; NOHSAOPT: !0 = !{i32 0, i32 2048} +; NOHSAOPT: !0 = !{i32 0, i32 257} +; NOHSAOPT: !1 = !{i32 0, i32 256} diff --git a/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll index e515ca00d184..187320805c11 100644 --- a/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll +++ b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll @@ -12,7 +12,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X -define void @ngroups_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -27,7 +27,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y -define void @ngroups_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -42,7 +42,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? 
*}}[[VAL]], KC0[0].Z -define void @ngroups_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -57,7 +57,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].W -define void @global_size_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -72,7 +72,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].X -define void @global_size_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -87,7 +87,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y -define void @global_size_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -102,7 +102,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z -define void @local_size_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -117,7 +117,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].W -define void @local_size_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -132,7 +132,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? 
*}}[[VAL]], KC0[2].X -define void @local_size_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -153,7 +153,7 @@ entry: ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @tgid_x_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -165,7 +165,7 @@ entry: ; GCN-NOHSA: buffer_store_dword [[VVAL]] ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 -define void @tgid_y_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -181,7 +181,7 @@ entry: ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @tgid_z_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -194,7 +194,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_x_legacy: ; GCN-NOHSA: buffer_store_dword v0 -define void @tidig_x_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -208,7 +208,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_y_legacy: ; GCN-NOHSA: buffer_store_dword v1 -define void @tidig_y_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -221,7 +221,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_z_legacy: ; GCN-NOHSA: buffer_store_dword v2 -define void @tidig_z_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll index dde5f8c21769..2aec03aff8a3 100644 --- a/test/CodeGen/AMDGPU/and-gcn.ll +++ b/test/CodeGen/AMDGPU/and-gcn.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}v_and_i64_br: ; SI: v_and_b32 ; SI: v_and_b32 -define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll index 5d9dcf64debf..c356f8b87cfc 100644 --- a/test/CodeGen/AMDGPU/and.ll +++ b/test/CodeGen/AMDGPU/and.ll @@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() #0 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -31,7 +31,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ; 
SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -42,7 +42,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; FUNC-LABEL: {{^}}s_and_i32: ; SI: s_and_b32 -define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, %b store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -50,7 +50,7 @@ define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; FUNC-LABEL: {{^}}s_and_constant_i32: ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687 -define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -66,7 +66,7 @@ define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { ; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] ; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] ; SI: buffer_store_dword [[VK]] -define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 ; Just to stop future replacement of copy to vgpr + store with VALU op. @@ -83,7 +83,7 @@ define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 ; SI: s_add_i32 ; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]] ; SI: buffer_store_dword [[VK]] -define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 %foo = add i32 %and, 1234567 %bar = add i32 %foo, %b @@ -93,7 +93,7 @@ define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 ; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr: ; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -109,7 +109,7 @@ define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; SI-DAG: s_load_dword [[SA:s[0-9]+]] ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] -define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -123,7 +123,7 @@ define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1 ; SI-DAG: s_load_dword [[SA:s[0-9]+]] ; SI-DAG: 
{{buffer|flat}}_load_dword [[VB:v[0-9]+]] ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] -define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) { +define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -135,7 +135,7 @@ define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; FUNC-LABEL: {{^}}v_and_constant_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} -define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -144,7 +144,7 @@ define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} -define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, 64 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* % ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} -define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, -16 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -162,7 +162,7 @@ define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1 ; FUNC-LABEL: {{^}}s_and_i64 ; SI: s_and_b64 -define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %and = and i64 %a, %b store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -171,7 +171,7 @@ define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; FIXME: Should use SGPRs ; FUNC-LABEL: {{^}}s_and_i1: ; SI: v_and_b32 -define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { +define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { %and = and i1 %a, %b store i1 %and, i1 addrspace(1)* %out ret void @@ -181,7 +181,7 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}} ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}} ; SI: buffer_store_dwordx2 -define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { %and = and i64 %a, 549756338176 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -191,7 +191,7 @@ define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { ; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}} ; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}} ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}} -define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define 
amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %and0 = and i64 %a, 549756338176 %and1 = and i64 %b, 549756338176 store volatile i64 %and0, i64 addrspace(1)* %out @@ -205,7 +205,7 @@ define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}} ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -223,7 +223,7 @@ define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { %shl.a = shl i64 %a, 1 %shl.b = shl i64 %b, 1 %and0 = and i64 %shl.a, 62 @@ -238,7 +238,7 @@ define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 ; FUNC-LABEL: {{^}}v_and_i64: ; SI: v_and_b32 ; SI: v_and_b32 -define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %and = and i64 %a, %b @@ -250,7 +250,7 @@ define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}} ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}} ; SI: buffer_store_dwordx2 -define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1231231234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -268,7 +268,7 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]] ; SI: buffer_store_dwordx2 ; SI: buffer_store_dwordx2 -define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load volatile i64, i64 addrspace(1)* %aptr %b = load volatile i64, i64 addrspace(1)* %aptr %and0 = and i64 %a, 1231231234567 @@ -288,7 +288,7 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]] -define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load volatile i64, i64 addrspace(1)* %aptr %b = load volatile i64, i64 addrspace(1)* %aptr %and0 = and i64 %a, 63 @@ -304,7 +304,7 @@ define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspac ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]] ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void 
@v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -317,7 +317,7 @@ define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -331,7 +331,7 @@ define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %apt ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]] ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} -define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, -8 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -344,7 +344,7 @@ define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64 ; SI-NOT: and ; SI: buffer_store_dword -define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -358,7 +358,7 @@ define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI-NOT: and ; SI: s_add_u32 ; SI-NEXT: s_addc_u32 -define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) { %shl = shl i64 %a, 1 %and = and i64 %shl, 64 %add = add i64 %and, %b @@ -372,7 +372,7 @@ define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrsp ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 1 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -387,7 +387,7 @@ define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4607182418800017408 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -402,7 +402,7 @@ define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { 
%and = and i64 %a, 13830554455654793216 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -417,7 +417,7 @@ define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4602678819172646912 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -432,7 +432,7 @@ define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13826050856027422720 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -445,7 +445,7 @@ define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4611686018427387904 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -458,7 +458,7 @@ define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13835058055282163712 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -473,7 +473,7 @@ define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4616189618054758400 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -488,7 +488,7 @@ define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13839561654909534208 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -505,7 +505,7 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 1082130432 
store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -518,7 +518,7 @@ define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, -1065353216 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -531,7 +531,7 @@ define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4647714815446351872 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -544,7 +544,7 @@ define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13871086852301127680 store i64 %and, i64 addrspace(1)* %out, align 8 ret void diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 084a6933da26..e2620ce353c6 100644 --- a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -11,22 +11,22 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -34,8 +34,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { %val0 = 
call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -43,15 +43,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -59,8 +59,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -68,8 +68,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() %val2 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -79,29 +79,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { -define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { +define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { -define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { +define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %val0, i32 
addrspace(1)* %ptr @@ -109,8 +109,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { -define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { +define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -118,8 +118,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { -define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { +define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() %val2 = call i32 @llvm.amdgcn.workitem.id.z() @@ -129,8 +129,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { -define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { +define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() %val2 = call i32 @llvm.amdgcn.workitem.id.z() @@ -146,8 +146,8 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 { -define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 { +define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %bc @@ -155,8 +155,8 @@ define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 { -define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 { +define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %bc @@ -164,58 +164,58 @@ define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { -define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { +define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 { -define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 { +define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 { %stof = addrspacecast i32* %ptr to i32 addrspace(4)* store 
volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos ret void } -; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32* store volatile i32 0, i32* %ftos ret void } ; No-op addrspacecast should not use queue ptr -; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { -define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { -define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos ret void } -; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)* %ld = load volatile i32, i32 addrspace(2)* %ftos ret void diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/test/CodeGen/AMDGPU/annotate-kernel-features.ll index a4e7bb67d507..09750da4cb8c 100644 --- a/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -12,22 +12,22 @@ declare i32 @llvm.r600.read.local.size.x() #0 declare i32 @llvm.r600.read.local.size.y() #0 declare i32 @llvm.r600.read.local.size.z() #0 -; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void 
@use_tgid_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr %val1 = call i32 @llvm.r600.read.tgid.y() @@ -35,8 +35,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -44,15 +44,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -60,8 +60,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.y() %val1 = call i32 @llvm.r600.read.tgid.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -69,8 +69,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() %val2 = call i32 @llvm.r600.read.tgid.z() @@ -80,29 +80,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 
@llvm.r600.read.tidig.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { -define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { +define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tidig.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { -define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { +define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tidig.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tgid.x() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -110,8 +110,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { -define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { +define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.y() %val1 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -119,8 +119,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { -define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { +define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() %val2 = call i32 @llvm.r600.read.tidig.z() @@ -130,8 +130,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { -define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { +define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() %val2 = call i32 @llvm.r600.read.tidig.z() @@ -147,25 +147,25 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { -define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { -define void 
@use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { -define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.z() store i32 %val, i32 addrspace(1)* %ptr ret void diff --git a/test/CodeGen/AMDGPU/anonymous-gv.ll b/test/CodeGen/AMDGPU/anonymous-gv.ll index f37b0f3382f4..04fbe2ae1f94 100644 --- a/test/CodeGen/AMDGPU/anonymous-gv.ll +++ b/test/CodeGen/AMDGPU/anonymous-gv.ll @@ -6,13 +6,13 @@ ; CHECK-LABEL: {{^}}test: ; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1 ; CHECK: s_endpgm -define void @test() { +define amdgpu_kernel void @test() { store i32 1, i32 addrspace(1)* @0 ret void } ; CHECK-LABEL: {{^}}__unnamed_2: ; CHECK: s_endpgm -define void @1() { +define amdgpu_kernel void @1() { ret void } diff --git a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll new file mode 100644 index 000000000000..c61c23222bc7 --- /dev/null +++ b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32: +; GCN: {{buffer|flat}}_load_dwordx4 +; GCN-DAG: {{buffer|flat}}_load_dwordx4 +; GCN-DAG: {{buffer|flat}}_load_dword + +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte + +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte + +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte + +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +; GCN: {{buffer|flat}}_store_byte +define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(<8 x i8> addrspace(1)* nocapture readonly %arg, <16 x i8> addrspace(1)* %arg1) local_unnamed_addr #0 { +bb: + %tmp = bitcast <8 x i8> addrspace(1)* %arg to <16 x i8> addrspace(1)* + %tmp2 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp, align 16 + %tmp3 = extractelement <16 x i8> %tmp2, i64 4 + %tmp6 = extractelement <16 x i8> %tmp2, i64 11 + %tmp10 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %arg, i64 2 + %tmp11 = bitcast <8 x i8> addrspace(1)* %tmp10 to <16 x i8> addrspace(1)* + %tmp12 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp11, align 16 + %tmp13 = extractelement <16 x i8> %tmp12, i64 7 + %tmp17 = extractelement <16 x i8> %tmp12, i64 12 + %tmp21 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %arg, i64 4 + %tmp22 = bitcast <8 x i8> addrspace(1)* %tmp21 to <16 x i8> 
addrspace(1)* + %tmp23 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp22, align 16 + %tmp24 = extractelement <16 x i8> %tmp23, i64 3 + %tmp1 = insertelement <16 x i8> undef, i8 %tmp3, i32 2 + %tmp4 = insertelement <16 x i8> %tmp1, i8 0, i32 3 + %tmp5 = insertelement <16 x i8> %tmp4, i8 0, i32 4 + %tmp7 = insertelement <16 x i8> %tmp5, i8 %tmp6, i32 5 + %tmp8 = insertelement <16 x i8> %tmp7, i8 0, i32 6 + %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp13, i32 7 + %tmp14 = insertelement <16 x i8> %tmp9, i8 0, i32 8 + %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp17, i32 9 + %tmp16 = insertelement <16 x i8> %tmp15, i8 0, i32 10 + %tmp18 = insertelement <16 x i8> %tmp16, i8 0, i32 11 + %tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp24, i32 12 + store <16 x i8> %tmp19, <16 x i8> addrspace(1)* %arg1, align 1 + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/anyext.ll b/test/CodeGen/AMDGPU/anyext.ll index 87b4c86427c8..3f220c408412 100644 --- a/test/CodeGen/AMDGPU/anyext.ll +++ b/test/CodeGen/AMDGPU/anyext.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; GCN-LABEL: {{^}}anyext_i1_i32: ; GCN: v_cndmask_b32_e64 -define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { entry: %tmp = icmp eq i32 %cond, 0 %tmp1 = zext i1 %tmp to i8 @@ -22,7 +22,7 @@ entry: ; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]] ; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]] ; VI: buffer_store_dword [[AND]] -define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) { +define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) { entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll index f190bd0cb01e..daa3442097cf 100644 --- a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -12,11 +12,7 @@ declare void @llvm.amdgcn.s.barrier() #2 ; SI-LABEL: {{^}}test_private_array_ptr_calc: -; FIXME: We end up with zero argument for ADD, because -; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index -; with the appropriate offset. We should fold this into the store. 
- -; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}} +; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}} ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64 ; SI-ALLOCA: s_barrier ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64 @@ -28,7 +24,7 @@ declare void @llvm.amdgcn.s.barrier() #2 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64 ; SI-PROMOTE: ds_write_b32 [[PTRREG]] -define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 { +define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 { %alloca = alloca [16 x i32], align 16 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll index b914edf2928e..ddeffc10a089 100644 --- a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_i32 ; SI: s_endpgm -define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 diff --git a/test/CodeGen/AMDGPU/ashr.v2i16.ll b/test/CodeGen/AMDGPU/ashr.v2i16.ll new file mode 100644 index 000000000000..96a5e3b23758 --- /dev/null +++ b/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -0,0 +1,161 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s + +; GCN-LABEL: {{^}}s_ashr_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] + +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +; CI: v_ashrrev_i32_e32 +; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_or_b32_e32 +define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { + %result = ashr <2 x i16> %lhs, %rhs + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_ashr_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: 
v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] + +; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]] +; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]] +; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr + %result = ashr <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}ashr_v_s_v2i16: +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define amdgpu_kernel void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = ashr <2 x i16> %vgpr, %sgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}ashr_s_v_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define amdgpu_kernel void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = ashr <2 x i16> %sgpr, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}ashr_imm_v_v2i16: +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4 +define amdgpu_kernel void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, 
<2 x i16> addrspace(1)* %in.gep
+  %result = ashr <2 x i16> <i16 -4, i16 -4>, %vgpr
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_v_imm_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]]
+define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = ashr <2 x i16> %vgpr, <i16 8, i16 8>
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
+  %result = ashr <4 x i16> %a, %b
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_v_imm_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %result = ashr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 25eae0b41ae4..4f9526ddab55 100644
--- a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -12,7 +12,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: 
s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -33,7 +33,7 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs ; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic %result = extractvalue { i64, i1 } %pair, 0 @@ -45,7 +45,7 @@ define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrs ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -65,7 +65,7 @@ define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i3 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -84,7 +84,7 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] ; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic %result = extractvalue { i64, i1 } %pair, 0 diff --git a/test/CodeGen/AMDGPU/atomic_load_add.ll b/test/CodeGen/AMDGPU/atomic_load_add.ll index 4b014e09b630..e0fe6641fa11 100644 --- a/test/CodeGen/AMDGPU/atomic_load_add.ll +++ b/test/CodeGen/AMDGPU/atomic_load_add.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}atomic_add_local: ; R600: LDS_ADD * ; SI: ds_add_u32 -define void @atomic_add_local(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) 
{ %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst ret void } @@ -13,7 +13,7 @@ define void @atomic_add_local(i32 addrspace(3)* %local) { ; FUNC-LABEL: {{^}}atomic_add_local_const_offset: ; R600: LDS_ADD * ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst ret void @@ -22,7 +22,7 @@ define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { ; FUNC-LABEL: {{^}}atomic_add_ret_local: ; R600: LDS_ADD_RET * ; SI: ds_add_rtn_u32 -define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out ret void @@ -31,7 +31,7 @@ define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc ; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: ; R600: LDS_ADD_RET * ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll index c6e5b1136d7c..a0275893919a 100644 --- a/test/CodeGen/AMDGPU/atomic_load_sub.ll +++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_local: ; R600: LDS_SUB * ; SI: ds_sub_u32 -define void @atomic_sub_local(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) { %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst ret void } @@ -13,7 +13,7 @@ define void @atomic_sub_local(i32 addrspace(3)* %local) { ; FUNC-LABEL: {{^}}atomic_sub_local_const_offset: ; R600: LDS_SUB * ; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst ret void @@ -22,7 +22,7 @@ define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { ; FUNC-LABEL: {{^}}atomic_sub_ret_local: ; R600: LDS_SUB_RET * ; SI: ds_sub_rtn_u32 -define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out ret void @@ -31,7 +31,7 @@ define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc ; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: ; R600: LDS_SUB_RET * ; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void 
@atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index cab377feacb2..63a6f6a8d32c 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -5,7 +5,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_64_max_64() #0 { +define amdgpu_kernel void @min_64_max_64() #0 { entry: ret void } @@ -16,7 +16,7 @@ attributes #0 = {"amdgpu-flat-work-group-size"="64,64"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_64_max_128() #1 { +define amdgpu_kernel void @min_64_max_128() #1 { entry: ret void } @@ -27,7 +27,7 @@ attributes #1 = {"amdgpu-flat-work-group-size"="64,128"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_128_max_128() #2 { +define amdgpu_kernel void @min_128_max_128() #2 { entry: ret void } @@ -39,7 +39,7 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK: NumSGPRsForWavesPerEU: 13 ; CHECK: NumVGPRsForWavesPerEU: 32 @var = addrspace(1) global float 0.0 -define void @min_1024_max_2048() #3 { +define amdgpu_kernel void @min_1024_max_2048() #3 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index e4f6e72e6977..ac2f7b4a4a4b 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -4,25 +4,22 @@ ; If spilling to smem, additional registers are used for the resource ; descriptor. -; ALL-LABEL: {{^}}max_12_sgprs: +; ALL-LABEL: {{^}}max_9_sgprs: -; FIXME: Should be ablo to skip this copying of the private segment -; buffer because all the SGPR spills are to VGPRs. 
- -; ALL: s_mov_b64 s[10:11], s[2:3] -; ALL: s_mov_b64 s[8:9], s[0:1] ; ALL: SGPRBlocks: 1 -; ALL: NumSGPRsForWavesPerEU: 14 -define void @max_12_sgprs(i32 addrspace(1)* %out1, +; ALL: NumSGPRsForWavesPerEU: 9 +define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, - i32 %one, i32 %two, i32 %three, i32 %four) #0 { + i32 addrspace(1)* %out5, + i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 { store i32 %one, i32 addrspace(1)* %out1 store i32 %two, i32 addrspace(1)* %out2 store i32 %three, i32 addrspace(1)* %out3 store i32 %four, i32 addrspace(1)* %out4 + store i32 %five, i32 addrspace(1)* %out5 ret void } @@ -52,23 +49,26 @@ define void @max_12_sgprs(i32 addrspace(1)* %out1, ; TOSMEM: SGPRBlocks: 1 ; TOSMEM: NumSGPRsForWavesPerEU: 16 -define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, +define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, i32 %one, i32 %two, i32 %three, i32 %four) #2 { - store volatile i32 0, i32* undef %x.0 = call i32 @llvm.amdgcn.workgroup.id.x() - store volatile i32 %x.0, i32 addrspace(1)* undef %x.1 = call i32 @llvm.amdgcn.workgroup.id.y() - store volatile i32 %x.0, i32 addrspace(1)* undef %x.2 = call i32 @llvm.amdgcn.workgroup.id.z() - store volatile i32 %x.0, i32 addrspace(1)* undef %x.3 = call i64 @llvm.amdgcn.dispatch.id() - store volatile i64 %x.3, i64 addrspace(1)* undef %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() - store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() + store volatile i32 0, i32* undef + br label %stores + +stores: + store volatile i32 %x.0, i32 addrspace(1)* undef + store volatile i32 %x.0, i32 addrspace(1)* undef + store volatile i32 %x.0, i32 addrspace(1)* undef + store volatile i64 %x.3, i64 addrspace(1)* undef + store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef store i32 %one, i32 addrspace(1)* %out1 @@ -90,7 +90,7 @@ define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, ; XALL: SGPRBlocks: 2 ; XALL: NumSGPRsForWavesPerEU: 18 -;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, +;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, ; i32 addrspace(1)* %out2, ; i32 addrspace(1)* %out3, ; i32 addrspace(1)* %out4, diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll index 97feb7276b7d..979665ff0a80 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}max_20_vgprs: ; CHECK: VGPRBlocks: 4 ; CHECK: NumVGPRsForWavesPerEU: 20 -define void @max_20_vgprs() #1 { +define amdgpu_kernel void @max_20_vgprs() #1 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 4f4efccc2260..3dda73bc336e 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -4,9 +4,9 @@ ; CHECK-LABEL: {{^}}empty_exactly_1: ; CHECK: SGPRBlocks: 12 ; CHECK: VGPRBlocks: 32 -; CHECK: NumSGPRsForWavesPerEU: 97 +; CHECK: 
NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 129 -define void @empty_exactly_1() #0 { +define amdgpu_kernel void @empty_exactly_1() #0 { entry: ret void } @@ -16,9 +16,9 @@ attributes #0 = {"amdgpu-waves-per-eu"="1,1"} ; CHECK-LABEL: {{^}}empty_exactly_5: ; CHECK: SGPRBlocks: 12 ; CHECK: VGPRBlocks: 10 -; CHECK: NumSGPRsForWavesPerEU: 97 +; CHECK: NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 41 -define void @empty_exactly_5() #1 { +define amdgpu_kernel void @empty_exactly_5() #1 { entry: ret void } @@ -30,7 +30,7 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_exactly_10() #2 { +define amdgpu_kernel void @empty_exactly_10() #2 { entry: ret void } @@ -42,7 +42,7 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_1() #3 { +define amdgpu_kernel void @empty_at_least_1() #3 { entry: ret void } @@ -54,7 +54,7 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_5() #4 { +define amdgpu_kernel void @empty_at_least_5() #4 { entry: ret void } @@ -66,7 +66,7 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_10() #5 { +define amdgpu_kernel void @empty_at_least_10() #5 { entry: ret void } @@ -78,9 +78,9 @@ attributes #5 = {"amdgpu-waves-per-eu"="10"} ; CHECK-LABEL: {{^}}empty_at_most_5: ; CHECK: SGPRBlocks: 12 ; CHECK: VGPRBlocks: 10 -; CHECK: NumSGPRsForWavesPerEU: 97 +; CHECK: NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 41 -define void @empty_at_most_5() #6 { +define amdgpu_kernel void @empty_at_most_5() #6 { entry: ret void } @@ -92,7 +92,7 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_most_10() #7 { +define amdgpu_kernel void @empty_at_most_10() #7 { entry: ret void } @@ -106,7 +106,7 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_between_5_and_10() #8 { +define amdgpu_kernel void @empty_between_5_and_10() #8 { entry: ret void } @@ -120,7 +120,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; CHECK: VGPRBlocks: 5 ; CHECK: NumSGPRsForWavesPerEU: 13 ; CHECK: NumVGPRsForWavesPerEU: 24 -define void @exactly_10() #9 { +define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var diff --git a/test/CodeGen/AMDGPU/attr-unparseable.ll b/test/CodeGen/AMDGPU/attr-unparseable.ll index 0282bc34c0ee..17adb89900cd 100644 --- a/test/CodeGen/AMDGPU/attr-unparseable.ll +++ b/test/CodeGen/AMDGPU/attr-unparseable.ll @@ -1,56 +1,56 @@ ; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_0() #0 { +define amdgpu_kernel void @unparseable_single_0() #0 { entry: ret void } attributes #0 = {"amdgpu-num-sgpr"} ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_1() #1 { +define amdgpu_kernel 
void @unparseable_single_1() #1 { entry: ret void } attributes #1 = {"amdgpu-num-sgpr"="k"} ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_2() #2 { +define amdgpu_kernel void @unparseable_single_2() #2 { entry: ret void } attributes #2 = {"amdgpu-num-sgpr"="1,2"} ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_0() #3 { +define amdgpu_kernel void @unparseable_pair_0() #3 { entry: ret void } attributes #3 = {"amdgpu-flat-work-group-size"} ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_1() #4 { +define amdgpu_kernel void @unparseable_pair_1() #4 { entry: ret void } attributes #4 = {"amdgpu-flat-work-group-size"="k"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_2() #5 { +define amdgpu_kernel void @unparseable_pair_2() #5 { entry: ret void } attributes #5 = {"amdgpu-flat-work-group-size"="1"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_3() #6 { +define amdgpu_kernel void @unparseable_pair_3() #6 { entry: ret void } attributes #6 = {"amdgpu-flat-work-group-size"="1,k"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_4() #7 { +define amdgpu_kernel void @unparseable_pair_4() #7 { entry: ret void } diff --git a/test/CodeGen/AMDGPU/barrier-elimination.ll b/test/CodeGen/AMDGPU/barrier-elimination.ll new file mode 100644 index 000000000000..c526baaab9cd --- /dev/null +++ b/test/CodeGen/AMDGPU/barrier-elimination.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s + +; CHECK-LABEL: {{^}}unknown_wgs: +; CHECK: s_barrier +define amdgpu_kernel void @unknown_wgs() { + tail call void @llvm.amdgcn.s.barrier() #0 + ret void +} + +; CHECK-LABEL: {{^}}flat_wgs_attr_32_128: +; CHECK: s_barrier +define amdgpu_kernel void @flat_wgs_attr_32_128() #1 { + tail call void @llvm.amdgcn.s.barrier() #0 + ret void +} + +; CHECK-LABEL: {{^}}flat_wgs_attr_32_64: +; CHECK: : +; CHECK-NEXT: ; wave barrier +; CHECK-NEXT: s_endpgm +define amdgpu_kernel void @flat_wgs_attr_32_64() #2 { + tail call void @llvm.amdgcn.s.barrier() #0 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #0 + +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" } +attributes #2 = { nounwind "amdgpu-flat-work-group-size"="32,64" } diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll index 104dd45e8a1a..e245e4296df2 100644 --- a/test/CodeGen/AMDGPU/basic-branch.ll +++ b/test/CodeGen/AMDGPU/basic-branch.ll @@ -8,17 +8,14 @@ ; GCNNOOPT: v_writelane_b32 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] - -; GCN: ; BB#1 ; GCNNOOPT: v_readlane_b32 ; GCNNOOPT: v_readlane_b32 ; GCN: buffer_store_dword -; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; TODO: This waitcnt can be eliminated +; GCNNOOPT: s_endpgm ; GCN: {{^}}[[END]]: ; GCN: s_endpgm -define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 { +define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 { %cmp = icmp ne i32 %val, 0 br i1 %cmp, label %store, label %end @@ -42,7 +39,7 @@ end: ; GCN: {{^}}[[END]]: ; GCN: s_endpgm -define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { +define amdgpu_kernel void @test_brcc_i1(i32 
addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { %cmp0 = icmp ne i1 %val, 0 br i1 %cmp0, label %store, label %end diff --git a/test/CodeGen/AMDGPU/basic-loop.ll b/test/CodeGen/AMDGPU/basic-loop.ll index f0263caf5d6b..de45190cdaa5 100644 --- a/test/CodeGen/AMDGPU/basic-loop.ll +++ b/test/CodeGen/AMDGPU/basic-loop.ll @@ -2,7 +2,7 @@ ; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_loop: -define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { +define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { entry: br label %loop.body diff --git a/test/CodeGen/AMDGPU/bfe-patterns.ll b/test/CodeGen/AMDGPU/bfe-patterns.ll new file mode 100644 index 000000000000..c23cc1c88b52 --- /dev/null +++ b/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -0,0 +1,163 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}v_ubfe_sub_i32: +; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]] +; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]] +define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x + %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %src = load volatile i32, i32 addrspace(1)* %in0.gep + %width = load volatile i32, i32 addrspace(1)* %in0.gep + %sub = sub i32 32, %width + %shl = shl i32 %src, %sub + %bfe = lshr i32 %shl, %sub + store i32 %bfe, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_ubfe_sub_multi_use_shl_i32: +; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]] +; GCN: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]] + +; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]] +; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]] + +; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]] +; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]] + +; GCN: [[BFE]] +; GCN: [[SHL]] +define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x + %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %src = load volatile i32, i32 addrspace(1)* %in0.gep + %width = load volatile i32, i32 addrspace(1)* %in0.gep + %sub = sub i32 32, %width + %shl = shl i32 %src, %sub + %bfe = lshr i32 %shl, %sub + store i32 %bfe, i32 addrspace(1)* %out.gep + store volatile i32 %shl, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}s_ubfe_sub_i32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: s_load_dword [[WIDTH:s[0-9]+]] +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}} +; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] +define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %out.gep 
= getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %sub = sub i32 32, %width + %shl = shl i32 %src, %sub + %bfe = lshr i32 %shl, %sub + store i32 %bfe, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: s_load_dword [[WIDTH:s[0-9]+]] +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] +; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] +; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] +define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %sub = sub i32 32, %width + %shl = shl i32 %src, %sub + %bfe = lshr i32 %shl, %sub + store i32 %bfe, i32 addrspace(1)* %out.gep + store volatile i32 %shl, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_sbfe_sub_i32: +; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]] +; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]] +define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x + %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %src = load volatile i32, i32 addrspace(1)* %in0.gep + %width = load volatile i32, i32 addrspace(1)* %in0.gep + %sub = sub i32 32, %width + %shl = shl i32 %src, %sub + %bfe = ashr i32 %shl, %sub + store i32 %bfe, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_sbfe_sub_multi_use_shl_i32: +; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]] +; GCN: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]] + +; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]] +; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]] + +; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]] +; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]] + +; GCN: [[BFE]] +; GCN: [[SHL]] +define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x + %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %src = load volatile i32, i32 addrspace(1)* %in0.gep + %width = load volatile i32, i32 addrspace(1)* %in0.gep + %sub = sub i32 32, %width + %shl = shl i32 %src, %sub + %bfe = ashr i32 %shl, %sub + store i32 %bfe, i32 addrspace(1)* %out.gep + store volatile i32 %shl, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}s_sbfe_sub_i32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: s_load_dword [[WIDTH:s[0-9]+]] +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}} +; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] +define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %sub = sub i32 32, %width + %shl = shl i32 %src, %sub + %bfe = ashr i32 %shl, %sub + store i32 %bfe, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32: +; 
GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: s_load_dword [[WIDTH:s[0-9]+]] +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] +; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] +; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] +define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %sub = sub i32 32, %width + %shl = shl i32 %src, %sub + %bfe = ashr i32 %shl, %sub + store i32 %bfe, i32 addrspace(1)* %out.gep + store volatile i32 %shl, i32 addrspace(1)* undef + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/bfe_uint.ll b/test/CodeGen/AMDGPU/bfe_uint.ll index 32e3fc26106f..2c8c9a5ec932 100644 --- a/test/CodeGen/AMDGPU/bfe_uint.ll +++ b/test/CodeGen/AMDGPU/bfe_uint.ll @@ -2,7 +2,7 @@ ; CHECK: {{^}}bfe_def: ; CHECK: BFE_UINT -define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { +define amdgpu_kernel void @bfe_def(i32 addrspace(1)* %out, i32 %x) { entry: %0 = lshr i32 %x, 5 %1 = and i32 %0, 15 ; 0xf @@ -17,7 +17,7 @@ entry: ; CHECK: {{^}}bfe_shift: ; CHECK-NOT: BFE_UINT -define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { +define amdgpu_kernel void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { entry: %0 = lshr i32 %x, 16 %1 = and i32 %0, 65535 ; 0xffff diff --git a/test/CodeGen/AMDGPU/bfi_int.ll b/test/CodeGen/AMDGPU/bfi_int.ll index 5156137fd78a..7870e5f378d3 100644 --- a/test/CodeGen/AMDGPU/bfi_int.ll +++ b/test/CodeGen/AMDGPU/bfi_int.ll @@ -9,7 +9,7 @@ ; R600: BFI_INT ; SI: @bfi_def ; SI: v_bfi_b32 -define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = xor i32 %x, -1 %1 = and i32 %z, %0 @@ -25,7 +25,7 @@ entry: ; R600: BFI_INT ; SI: @bfi_sha256_ch ; SI: v_bfi_b32 -define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = xor i32 %y, %z %1 = and i32 %x, %0 @@ -42,7 +42,7 @@ entry: ; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}} ; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}} -define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = and i32 %x, %z %1 = or i32 %x, %z diff --git a/test/CodeGen/AMDGPU/bfm.ll b/test/CodeGen/AMDGPU/bfm.ll index 790458d0d60c..5673995588da 100644 --- a/test/CodeGen/AMDGPU/bfm.ll +++ b/test/CodeGen/AMDGPU/bfm.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}bfm_pattern: ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %a = shl i32 1, %x %b = sub i32 %a, 1 %c = shl i32 %b, %y @@ -14,7 +14,7 @@ define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { ; FUNC-LABEL: {{^}}bfm_pattern_simple: ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 -define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 { %a = shl i32 1, %x %b = sub i32 %a, 1 store i32 %b, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/big_alu.ll b/test/CodeGen/AMDGPU/big_alu.ll index 
0ab22b350f50..51387c8b79cb 100644 --- a/test/CodeGen/AMDGPU/big_alu.ll +++ b/test/CodeGen/AMDGPU/big_alu.ll @@ -2,7 +2,7 @@ ; This test ensures that R600 backend can handle ifcvt properly -define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) { +define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 { main_body: %tmp = extractelement <4 x float> %reg0, i32 0 %tmp1 = extractelement <4 x float> %reg0, i32 1 @@ -224,28 +224,31 @@ ENDIF136: ; preds = %ENDIF154, %main_bod %result.i = fadd float %mul.i, %one.sub.ac.i %tmp204 = fadd float %result.i, 0x3FF4CCCCC0000000 %tmp205 = fmul float %tmp204, 0x3FE1C71C80000000 - %tmp206 = call float @llvm.AMDGPU.clamp.f32(float %tmp205, float 0.000000e+00, float 1.000000e+00) + %max.0.i = call float @llvm.maxnum.f32(float %tmp205, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) %tmp207 = fadd float %result.i, 0x3FF4CCCCC0000000 %tmp208 = fmul float %tmp207, 0x3FE1C71C80000000 - %tmp209 = call float @llvm.AMDGPU.clamp.f32(float %tmp208, float 0.000000e+00, float 1.000000e+00) + %max.0.i15 = call float @llvm.maxnum.f32(float %tmp208, float 0.000000e+00) + %clamp.i16 = call float @llvm.minnum.f32(float %max.0.i15, float 1.000000e+00) %tmp210 = fadd float %result.i, 2.000000e+00 %tmp211 = fmul float %tmp210, 0x3FD611A7A0000000 - %tmp212 = call float @llvm.AMDGPU.clamp.f32(float %tmp211, float 0.000000e+00, float 1.000000e+00) - %tmp213 = fmul float 2.000000e+00, %tmp206 + %max.0.i13 = call float @llvm.maxnum.f32(float %tmp211, float 0.000000e+00) + %clamp.i14 = call float @llvm.minnum.f32(float %max.0.i13, float 1.000000e+00) + %tmp213 = fmul float 2.000000e+00, %clamp.i %tmp214 = fsub float -0.000000e+00, %tmp213 %tmp215 = fadd float 3.000000e+00, %tmp214 - %tmp216 = fmul float %tmp206, %tmp215 - %tmp217 = fmul float %tmp206, %tmp216 - %tmp218 = fmul float 2.000000e+00, %tmp209 + %tmp216 = fmul float %clamp.i, %tmp215 + %tmp217 = fmul float %clamp.i, %tmp216 + %tmp218 = fmul float 2.000000e+00, %clamp.i16 %tmp219 = fsub float -0.000000e+00, %tmp218 %tmp220 = fadd float 3.000000e+00, %tmp219 - %tmp221 = fmul float %tmp209, %tmp220 - %tmp222 = fmul float %tmp209, %tmp221 - %tmp223 = fmul float 2.000000e+00, %tmp212 + %tmp221 = fmul float %clamp.i16, %tmp220 + %tmp222 = fmul float %clamp.i16, %tmp221 + %tmp223 = fmul float 2.000000e+00, %clamp.i14 %tmp224 = fsub float -0.000000e+00, %tmp223 %tmp225 = fadd float 3.000000e+00, %tmp224 - %tmp226 = fmul float %tmp212, %tmp225 - %tmp227 = fmul float %tmp212, %tmp226 + %tmp226 = fmul float %clamp.i14, %tmp225 + %tmp227 = fmul float %clamp.i14, %tmp226 %tmp228 = fmul float %tmp26, 0x3F368B5CC0000000 %tmp229 = fmul float %tmp27, 0x3F368B5CC0000000 %tmp230 = insertelement <4 x float> undef, float %tmp228, i32 0 @@ -282,28 +285,31 @@ ENDIF136: ; preds = %ENDIF154, %main_bod %tmp261 = fmul float %tmp257, 0.000000e+00 %tmp262 = fadd float %result.i, 0x3FF4CCCCC0000000 %tmp263 = fmul float %tmp262, 0x3FE1C71C80000000 - %tmp264 = call float @llvm.AMDGPU.clamp.f32(float %tmp263, float 0.000000e+00, float 1.000000e+00) + %max.0.i11 = call 
float @llvm.maxnum.f32(float %tmp263, float 0.000000e+00) + %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00) %tmp265 = fadd float %result.i, 0x3FF4CCCCC0000000 %tmp266 = fmul float %tmp265, 0x3FE1C71C80000000 - %tmp267 = call float @llvm.AMDGPU.clamp.f32(float %tmp266, float 0.000000e+00, float 1.000000e+00) + %max.0.i9 = call float @llvm.maxnum.f32(float %tmp266, float 0.000000e+00) + %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00) %tmp268 = fadd float %result.i, 2.000000e+00 %tmp269 = fmul float %tmp268, 0x3FD611A7A0000000 - %tmp270 = call float @llvm.AMDGPU.clamp.f32(float %tmp269, float 0.000000e+00, float 1.000000e+00) - %tmp271 = fmul float 2.000000e+00, %tmp264 + %max.0.i7 = call float @llvm.maxnum.f32(float %tmp269, float 0.000000e+00) + %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00) + %tmp271 = fmul float 2.000000e+00, %clamp.i12 %tmp272 = fsub float -0.000000e+00, %tmp271 %tmp273 = fadd float 3.000000e+00, %tmp272 - %tmp274 = fmul float %tmp264, %tmp273 - %tmp275 = fmul float %tmp264, %tmp274 - %tmp276 = fmul float 2.000000e+00, %tmp267 + %tmp274 = fmul float %clamp.i12, %tmp273 + %tmp275 = fmul float %clamp.i12, %tmp274 + %tmp276 = fmul float 2.000000e+00, %clamp.i10 %tmp277 = fsub float -0.000000e+00, %tmp276 %tmp278 = fadd float 3.000000e+00, %tmp277 - %tmp279 = fmul float %tmp267, %tmp278 - %tmp280 = fmul float %tmp267, %tmp279 - %tmp281 = fmul float 2.000000e+00, %tmp270 + %tmp279 = fmul float %clamp.i10, %tmp278 + %tmp280 = fmul float %clamp.i10, %tmp279 + %tmp281 = fmul float 2.000000e+00, %clamp.i8 %tmp282 = fsub float -0.000000e+00, %tmp281 %tmp283 = fadd float 3.000000e+00, %tmp282 - %tmp284 = fmul float %tmp270, %tmp283 - %tmp285 = fmul float %tmp270, %tmp284 + %tmp284 = fmul float %clamp.i8, %tmp283 + %tmp285 = fmul float %clamp.i8, %tmp284 %tmp286 = fmul float %tmp26, 0x3F22DFD6A0000000 %tmp287 = fmul float %tmp27, 0x3F22DFD6A0000000 %tmp288 = insertelement <4 x float> undef, float %tmp286, i32 0 @@ -390,7 +396,8 @@ ENDIF136: ; preds = %ENDIF154, %main_bod %tmp369 = fadd float %tmp368, %tmp367 %tmp370 = fadd float %tmp369, 0xBFEFAE1480000000 %tmp371 = fmul float %tmp370, 0xC023FFFFC0000000 - %tmp372 = call float @llvm.AMDGPU.clamp.f32(float %tmp371, float 0.000000e+00, float 1.000000e+00) + %max.0.i5 = call float @llvm.maxnum.f32(float %tmp371, float 0.000000e+00) + %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00) %tmp373 = fsub float -0.000000e+00, %tmp339 %tmp374 = fadd float %result.i, %tmp373 %tmp375 = fadd float %tmp374, 0x3FBEB851E0000000 @@ -416,12 +423,13 @@ ENDIF136: ; preds = %ENDIF154, %main_bod %tmp395 = fadd float %tmp394, %tmp393 %tmp396 = fadd float %tmp395, 0xBFEFAE1480000000 %tmp397 = fmul float %tmp396, 0xC0490001A0000000 - %tmp398 = call float @llvm.AMDGPU.clamp.f32(float %tmp397, float 0.000000e+00, float 1.000000e+00) - %tmp399 = fmul float 2.000000e+00, %tmp372 + %max.0.i3 = call float @llvm.maxnum.f32(float %tmp397, float 0.000000e+00) + %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00) + %tmp399 = fmul float 2.000000e+00, %clamp.i6 %tmp400 = fsub float -0.000000e+00, %tmp399 %tmp401 = fadd float 3.000000e+00, %tmp400 - %tmp402 = fmul float %tmp372, %tmp401 - %tmp403 = fmul float %tmp372, %tmp402 + %tmp402 = fmul float %clamp.i6, %tmp401 + %tmp403 = fmul float %clamp.i6, %tmp402 %one.sub.a.i169 = fsub float 1.000000e+00, %tmp403 %one.sub.ac.i170 = fmul float %one.sub.a.i169, %tmp349 %mul.i171 = fmul 
float %tmp258, %tmp349 @@ -438,11 +446,11 @@ ENDIF136: ; preds = %ENDIF154, %main_bod %one.sub.ac.i158 = fmul float %one.sub.a.i157, 0.000000e+00 %mul.i159 = fmul float %tmp261, 0.000000e+00 %result.i160 = fadd float %mul.i159, %one.sub.ac.i158 - %tmp404 = fmul float 2.000000e+00, %tmp398 + %tmp404 = fmul float 2.000000e+00, %clamp.i4 %tmp405 = fsub float -0.000000e+00, %tmp404 %tmp406 = fadd float 3.000000e+00, %tmp405 - %tmp407 = fmul float %tmp398, %tmp406 - %tmp408 = fmul float %tmp398, %tmp407 + %tmp407 = fmul float %clamp.i4, %tmp406 + %tmp408 = fmul float %clamp.i4, %tmp407 %one.sub.a.i153 = fsub float 1.000000e+00, %tmp408 %one.sub.ac.i154 = fmul float %one.sub.a.i153, %tmp375 %mul.i155 = fmul float %tmp258, %tmp375 @@ -1157,12 +1165,13 @@ IF179: ; preds = %ENDIF175 %tmp882 = fadd float %tmp881, %tmp880 %tmp883 = fadd float %tmp882, 0xBFEFAE1480000000 %tmp884 = fmul float %tmp883, 0xC043FFFE20000000 - %tmp885 = call float @llvm.AMDGPU.clamp.f32(float %tmp884, float 0.000000e+00, float 1.000000e+00) - %tmp886 = fmul float 2.000000e+00, %tmp885 + %max.0.i1 = call float @llvm.maxnum.f32(float %tmp884, float 0.000000e+00) + %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) + %tmp886 = fmul float 2.000000e+00, %clamp.i2 %tmp887 = fsub float -0.000000e+00, %tmp886 %tmp888 = fadd float 3.000000e+00, %tmp887 - %tmp889 = fmul float %tmp885, %tmp888 - %tmp890 = fmul float %tmp885, %tmp889 + %tmp889 = fmul float %clamp.i2, %tmp888 + %tmp890 = fmul float %clamp.i2, %tmp889 %one.sub.a.i41 = fsub float 1.000000e+00, %tmp890 %one.sub.ac.i42 = fmul float %one.sub.a.i41, %tmp866 %mul.i43 = fmul float %temp84.5, %tmp866 @@ -1288,25 +1297,14 @@ ENDIF178: ; preds = %IF179, %ENDIF175 ret void } -; Function Attrs: nounwind readnone -declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.r600.recipsqrt.clamped.f32(float) #0 - -; Function Attrs: nounwind readonly +declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1 +declare float @llvm.r600.recipsqrt.clamped.f32(float) #1 declare float @llvm.fabs.f32(float) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.exp2.f32(float) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0 - +declare float @llvm.exp2.f32(float) #1 declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) +declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind readonly } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll index 3a55870c2882..cf95f74afb84 100644 --- a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll +++ b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll @@ -11,7 +11,7 @@ ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) { %vec0.bc = bitcast <8 x i32> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x float> 
addrspace(1)* %out @@ -27,7 +27,7 @@ define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %ou ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) { %vec0.bc = bitcast <4 x i64> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out @@ -43,7 +43,7 @@ define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %ou ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) { %vec0.bc = bitcast <4 x i64> to <4 x double> store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out @@ -59,7 +59,7 @@ define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %o ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) { %vec0.bc = bitcast <16 x i16> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out @@ -67,3 +67,27 @@ define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %o store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source: +; GCN-NOT: store_dword +define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { + %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1 + %bc = bitcast i64 %undef to <2 x i32> + store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt: +; GCN-NOT: store_dword +define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { + %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1 + %bc = bitcast i64 %undef to <2 x i32> + %elt1 = extractelement <2 x i32> %bc, i32 1 + store volatile i32 %elt1, i32 addrspace(1)* %out + ret void +} + +declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone convergent } diff --git a/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll index f7dc1a9d37e8..3616ec1f45d3 100644 --- a/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll +++ b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}materialize_0_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) { store i32 0, i32 addrspace(1)* %out ret void } @@ -16,7 +16,7 @@ define void @materialize_0_i32(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void 
@materialize_0_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) { store i64 0, i64 addrspace(1)* %out ret void } @@ -24,7 +24,7 @@ define void @materialize_0_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_neg1_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_neg1_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) { store i32 -1, i32 addrspace(1)* %out ret void } @@ -33,7 +33,7 @@ define void @materialize_neg1_i32(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_neg1_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) { store i64 -1, i64 addrspace(1)* %out ret void } @@ -41,7 +41,7 @@ define void @materialize_neg1_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_signbit_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_signbit_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) { store i32 -2147483648, i32 addrspace(1)* %out ret void } @@ -50,7 +50,7 @@ define void @materialize_signbit_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_signbit_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) { store i64 -9223372036854775808, i64 addrspace(1)* %out ret void } @@ -58,7 +58,7 @@ define void @materialize_signbit_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_neg16_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { store i32 268435455, i32 addrspace(1)* %out ret void } @@ -67,7 +67,7 @@ define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { store i64 1152921504606846975, i64 addrspace(1)* %out ret void } @@ -75,7 +75,7 @@ define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_neg17_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { store i32 -134217729, i32 addrspace(1)* %out ret void } @@ -84,7 +84,7 @@ define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) { store i64 -576460752303423489, i64 addrspace(1)* %out ret void } @@ -92,7 +92,7 @@ 
define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_64_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_64_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) { store i32 33554432, i32 addrspace(1)* %out ret void } @@ -101,7 +101,7 @@ define void @materialize_rev_64_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_64_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) { store i64 144115188075855872, i64 addrspace(1)* %out ret void } @@ -109,7 +109,7 @@ define void @materialize_rev_64_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_65_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_65_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) { store i32 -2113929216, i32 addrspace(1)* %out ret void } @@ -118,7 +118,7 @@ define void @materialize_rev_65_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_65_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) { store i64 -9079256848778919936, i64 addrspace(1)* %out ret void } @@ -126,7 +126,7 @@ define void @materialize_rev_65_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_3_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_3_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) { store i32 -1073741824, i32 addrspace(1)* %out ret void } @@ -135,7 +135,7 @@ define void @materialize_rev_3_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_3_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) { store i64 -4611686018427387904, i64 addrspace(1)* %out ret void } @@ -143,7 +143,7 @@ define void @materialize_rev_3_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_1.0_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { store i32 508, i32 addrspace(1)* %out ret void } @@ -152,70 +152,70 @@ define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) { store i64 508, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}s_materialize_0_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0{{$}} -define void @s_materialize_0_i32() { +define amdgpu_kernel void @s_materialize_0_i32() { call void asm sideeffect "; use $0", 
"s"(i32 0) ret void } ; GCN-LABEL: {{^}}s_materialize_1_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 1{{$}} -define void @s_materialize_1_i32() { +define amdgpu_kernel void @s_materialize_1_i32() { call void asm sideeffect "; use $0", "s"(i32 1) ret void } ; GCN-LABEL: {{^}}s_materialize_neg1_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, -1{{$}} -define void @s_materialize_neg1_i32() { +define amdgpu_kernel void @s_materialize_neg1_i32() { call void asm sideeffect "; use $0", "s"(i32 -1) ret void } ; GCN-LABEL: {{^}}s_materialize_signbit_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, 1{{$}} -define void @s_materialize_signbit_i32() { +define amdgpu_kernel void @s_materialize_signbit_i32() { call void asm sideeffect "; use $0", "s"(i32 -2147483648) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_64_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, 64{{$}} -define void @s_materialize_rev_64_i32() { +define amdgpu_kernel void @s_materialize_rev_64_i32() { call void asm sideeffect "; use $0", "s"(i32 33554432) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_65_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0x82000000{{$}} -define void @s_materialize_rev_65_i32() { +define amdgpu_kernel void @s_materialize_rev_65_i32() { call void asm sideeffect "; use $0", "s"(i32 -2113929216) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_neg16_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, -16{{$}} -define void @s_materialize_rev_neg16_i32() { +define amdgpu_kernel void @s_materialize_rev_neg16_i32() { call void asm sideeffect "; use $0", "s"(i32 268435455) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_neg17_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0xf7ffffff{{$}} -define void @s_materialize_rev_neg17_i32() { +define amdgpu_kernel void @s_materialize_rev_neg17_i32() { call void asm sideeffect "; use $0", "s"(i32 -134217729) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_1.0_i32: ; GCN: s_movk_i32 s{{[0-9]+}}, 0x1fc{{$}} -define void @s_materialize_rev_1.0_i32() { +define amdgpu_kernel void @s_materialize_rev_1.0_i32() { call void asm sideeffect "; use $0", "s"(i32 508) ret void } diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll index 43a4200cb3bd..539373f7bdeb 100644 --- a/test/CodeGen/AMDGPU/bitreverse.ll +++ b/test/CodeGen/AMDGPU/bitreverse.ll @@ -14,7 +14,7 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 ; FUNC-LABEL: {{^}}s_brev_i16: ; SI: s_brev_b32 -define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { +define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out ret void @@ -22,7 +22,7 @@ define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { ; FUNC-LABEL: {{^}}v_brev_i16: ; SI: v_bfrev_b32_e32 -define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { %val = load i16, i16 addrspace(1)* %valptr %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out @@ -35,7 +35,7 @@ define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalia ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], ; SI: s_endpgm -define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { +define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out ret void @@ 
-46,7 +46,7 @@ define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { %val = load i32, i32 addrspace(1)* %valptr %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out @@ -56,7 +56,7 @@ define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia ; FUNC-LABEL: {{^}}s_brev_v2i32: ; SI: s_brev_b32 ; SI: s_brev_b32 -define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { +define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out ret void @@ -65,7 +65,7 @@ define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) ; FUNC-LABEL: {{^}}v_brev_v2i32: ; SI: v_bfrev_b32_e32 ; SI: v_bfrev_b32_e32 -define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out @@ -73,7 +73,7 @@ define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp } ; FUNC-LABEL: {{^}}s_brev_i64: -define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { +define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out ret void @@ -81,7 +81,7 @@ define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { ; FUNC-LABEL: {{^}}v_brev_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 -define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { %val = load i64, i64 addrspace(1)* %valptr %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out @@ -89,14 +89,14 @@ define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia } ; FUNC-LABEL: {{^}}s_brev_v2i64: -define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { +define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_brev_v2i64: -define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/br_cc.f16.ll b/test/CodeGen/AMDGPU/br_cc.f16.ll index 0072d384f217..b7a0c8738dfa 100644 --- a/test/CodeGen/AMDGPU/br_cc.f16.ll +++ 
b/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -1,27 +1,26 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; GCN-LABEL: {{^}}br_cc_f16 +; GCN-LABEL: {{^}}br_cc_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cmp_nlt_f32_e32 vcc, v[[B_F32]], v[[A_F32]] ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; GCN: s_cbranch_vccnz ; GCN: one{{$}} -; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]] -; SI: s_branch -; VI: buffer_store_short -; VI: s_endpgm +; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[B_F32]] +; GCN: buffer_store_short +; GCN: s_endpgm ; GCN: two{{$}} -; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] +; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[B_F16]] ; GCN: s_endpgm -define void @br_cc_f16( +define amdgpu_kernel void @br_cc_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -40,29 +39,27 @@ two: ret void } -; GCN-LABEL: {{^}}br_cc_f16_imm_a -; SI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] +; GCN-LABEL: {{^}}br_cc_f16_imm_a: ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]] -; SI: s_cbranch_vccz +; SI: v_cmp_nlt_f32_e32 vcc, 0.5, v[[B_F32]] +; SI: s_cbranch_vccnz ; VI: v_cmp_nlt_f16_e32 vcc, 0.5, v[[B_F16]] ; VI: s_cbranch_vccnz -; VI: one{{$}} +; GCN: one{{$}} ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}} -; GCN: two{{$}} -; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] - -; SI: one{{$}} ; SI: buffer_store_short v[[A_F16]] ; SI: s_endpgm -define void @br_cc_f16_imm_a( + +; GCN: two{{$}} +; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] + +define amdgpu_kernel void @br_cc_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -79,13 +76,12 @@ two: ret void } -; GCN-LABEL: {{^}}br_cc_f16_imm_b -; SI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}} -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; GCN-LABEL: {{^}}br_cc_f16_imm_b: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; SI: v_cmp_ngt_f32_e32 vcc, 0.5, v[[A_F32]] + ; VI: v_cmp_ngt_f16_e32 vcc, 0.5, v[[A_F16]] ; GCN: s_cbranch_vccnz @@ -96,7 +92,7 @@ two: ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}} ; GCN: buffer_store_short v[[B_F16]] ; GCN: s_endpgm -define void @br_cc_f16_imm_b( +define amdgpu_kernel void @br_cc_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/branch-condition-and.ll b/test/CodeGen/AMDGPU/branch-condition-and.ll index 94616a4be8fd..68b77ea3490e 100644 --- a/test/CodeGen/AMDGPU/branch-condition-and.ll +++ b/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -15,12 +15,16 @@ ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]] ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]] -; -; TODO: The 
following sequence is a bug (missing s_endpgm)! -; -; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]] -; GCN: [[BB]]: -; GCN-NEXT: .Lfunc_end0: +; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4 +; GCN: ds_write_b32 +; GCN: s_waitcnt + +; GCN-NEXT: [[BB5]] +; GCN: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +; GCN-NEXT: .Lfunc_end define amdgpu_ps void @ham(float %arg, float %arg1) #0 { bb: %tmp = fcmp ogt float %arg, 0.000000e+00 @@ -29,6 +33,7 @@ bb: br i1 %tmp3, label %bb4, label %bb5 bb4: ; preds = %bb + store volatile i32 4, i32 addrspace(3)* undef unreachable bb5: ; preds = %bb diff --git a/test/CodeGen/AMDGPU/branch-relax-spill.ll b/test/CodeGen/AMDGPU/branch-relax-spill.ll index 86b8dd89e7d0..ede15559c4ff 100644 --- a/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -5,7 +5,7 @@ ; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot! -define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { entry: %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0 %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0 diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll index 39505404a868..263059d4a6ed 100644 --- a/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -26,7 +26,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { bb: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch @@ -68,7 +68,7 @@ bb3: ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { bb0: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch @@ -108,7 +108,7 @@ bb3: ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { bb0: %cmp = fcmp oeq float %cnd, 0.0 br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch @@ -141,7 +141,7 @@ bb3: ; GCN: s_or_b64 exec, exec, [[SAVE]] ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 { +define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = zext i32 %tid to i64 @@ -188,7 +188,7 @@ bb3: ; GCN-NEXT: [[ENDBB]]: ; GCN-NEXT: s_endpgm -define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 { +define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 { bb: br label %bb2 @@ -243,7 +243,7 @@ bb3: ; GCN: buffer_store_dword [[BB4_K]] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .Lfunc_end{{[0-9]+}}: -define void @uniform_unconditional_min_long_forward_branch(i32 
addrspace(1)* %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) { bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -285,7 +285,7 @@ bb4: ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}} ; GCN-NEXT: s_setpc_b64 vcc ; GCN-NEXT .Lfunc_end{{[0-9]+}}: -define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { entry: br label %loop @@ -335,8 +335,14 @@ loop: ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: [[BB3]]: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm -define void @expand_requires_expand(i32 %cond0) #0 { +define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { bb0: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp slt i32 %cond0, 0 @@ -356,6 +362,12 @@ bb2: br label %bb3 bb3: +; These NOPs prevent tail-duplication-based outlining +; from firing, which defeats the need to expand the branches and this test. + call void asm sideeffect + "v_nop_e64", ""() #0 + call void asm sideeffect + "v_nop_e64", ""() #0 ret void } @@ -385,8 +397,9 @@ bb3: ; GCN-NEXT: [[ENDIF]]: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] +; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm -define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { +define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %d_cmp = icmp ult i32 %tid, 16 @@ -402,6 +415,9 @@ if_uniform: br label %endif endif: + ; layout can remove the split branch if it can copy the return block. + ; This call makes the return block long enough that it doesn't get copied. 
+ call void @llvm.amdgcn.s.sleep(i32 5); ret void } @@ -446,7 +462,7 @@ endif: ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] ; GCN: buffer_store_dword ; GCN-NEXT: s_endpgm -define void @analyze_mask_branch() #0 { +define amdgpu_kernel void @analyze_mask_branch() #0 { entry: %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"() %cmp0 = fcmp ogt float %reg, 0.000000e+00 @@ -475,7 +491,8 @@ ret: ; GCN-LABEL: {{^}}long_branch_hang: ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6 -; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]] +; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}} +; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]] ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-( diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll index c68951731098..d2dacd7c17b3 100644 --- a/test/CodeGen/AMDGPU/bswap.ll +++ b/test/CodeGen/AMDGPU/bswap.ll @@ -17,7 +17,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone ; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone store i32 %bswap, i32 addrspace(1)* %out, align 4 @@ -32,7 +32,7 @@ define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 @@ -53,7 +53,7 @@ define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace( ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 @@ -86,7 +86,7 @@ define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace( ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 @@ -95,21 +95,21 @@ define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace( ; FUNC-LABEL: {{^}}test_bswap_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 -define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %val = 
load i64, i64 addrspace(1)* %in, align 8 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone store i64 %bswap, i64 addrspace(1)* %out, align 8 ret void } -define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 ret void } -define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 diff --git a/test/CodeGen/AMDGPU/build_vector.ll b/test/CodeGen/AMDGPU/build_vector.ll index 0a5774c601d3..d77b0ab9fbb6 100644 --- a/test/CodeGen/AMDGPU/build_vector.ll +++ b/test/CodeGen/AMDGPU/build_vector.ll @@ -10,7 +10,7 @@ ; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 ; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 ; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} -define void @build_vector2 (<2 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) { entry: store <2 x i32> , <2 x i32> addrspace(1)* %out ret void @@ -28,7 +28,7 @@ entry: ; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 ; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 ; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} -define void @build_vector4 (<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) { entry: store <4 x i32> , <4 x i32> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/call.ll b/test/CodeGen/AMDGPU/call.ll index 6d101e1537cc..769c7bb3eee7 100644 --- a/test/CodeGen/AMDGPU/call.ll +++ b/test/CodeGen/AMDGPU/call.ll @@ -10,7 +10,7 @@ declare i32 @external_function(i32) nounwind -define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -25,7 +25,7 @@ define i32 @defined_function(i32 %x) nounwind noinline { ret i32 %y } -define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -35,7 +35,7 @@ define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ret void } -define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr diff --git a/test/CodeGen/AMDGPU/calling-conventions.ll b/test/CodeGen/AMDGPU/calling-conventions.ll index 57adc8be6a99..677147b6f4e5 100644 --- a/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/test/CodeGen/AMDGPU/calling-conventions.ll @@ -1,9 +1,10 @@ -; RUN: llc < %s -march=amdgcn 
-mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; Make sure we don't crash or assert on spir_kernel calling convention. -; SI-LABEL: {{^}}kernel: -; SI: s_endpgm +; GCN-LABEL: {{^}}kernel: +; GCN: s_endpgm define spir_kernel void @kernel(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out @@ -11,10 +12,34 @@ entry: } ; FIXME: This is treated like a kernel -; SI-LABEL: {{^}}func: -; SI: s_endpgm -define spir_func void @func(i32 addrspace(1)* %out) { -entry: - store i32 0, i32 addrspace(1)* %out - ret void +; XGCN-LABEL: {{^}}func: +; XGCN: s_endpgm +; define spir_func void @func(i32 addrspace(1)* %out) { +; entry: +; store i32 0, i32 addrspace(1)* %out +; ret void +; } + +; GCN-LABEL: {{^}}ps_ret_cc_f16: +; SI: v_cvt_f16_f32_e32 v0, v0 +; SI: v_cvt_f32_f16_e32 v0, v0 +; SI: v_add_f32_e32 v0, 1.0, v0 + +; VI: v_add_f16_e32 v0, 1.0, v0 +; VI: ; return +define amdgpu_ps half @ps_ret_cc_f16(half %arg0) { + %add = fadd half %arg0, 1.0 + ret half %add +} + +; GCN-LABEL: {{^}}ps_ret_cc_inreg_f16: +; SI: v_cvt_f16_f32_e32 v0, s0 +; SI: v_cvt_f32_f16_e32 v0, v0 +; SI: v_add_f32_e32 v0, 1.0, v0 + +; VI: v_add_f16_e64 v0, s0, 1.0 +; VI: ; return +define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) { + %add = fadd half %arg0, 1.0 + ret half %add } diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll index 49af159581f7..5fe1b2728506 100644 --- a/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,24 +1,24 @@ ; RUN: llc -mtriple=amdgcn-- -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}store_fi_lifetime: -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { entry: %b = alloca i8 - call void @llvm.lifetime.start(i64 1, i8* %b) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %b) store volatile i8* %b, i8* addrspace(1)* undef - call void @llvm.lifetime.end(i64 1, i8* %b) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %b) ret void } ; GCN-LABEL: {{^}}stored_fi_to_lds: ; GCN: s_load_dword [[LDSPTR:s[0-9]+]] ; GCN: buffer_store_dword v{{[0-9]+}}, off, -; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}} ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] -define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { %tmp = alloca float store float 4.0, float *%tmp store float* %tmp, float* addrspace(3)* %ptr @@ -27,18 +27,18 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects: -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, 
s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]] -; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} +; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]] -define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float store float 4.0, float* %tmp0 @@ -51,10 +51,10 @@ define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { ; Same frame index is used multiple times in the store ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} -define void @stored_fi_to_self() #0 { +; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} +; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32* ; Avoid optimizing everything out @@ -66,14 +66,14 @@ define void @stored_fi_to_self() #0 { ; GCN-LABEL: {{^}}stored_fi_to_self_offset: ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}} -; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}} +; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} -; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}} -; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}} -define void @stored_fi_to_self_offset() #0 { +; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} +; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} +define amdgpu_kernel void @stored_fi_to_self_offset() #0 { %tmp0 = alloca [512 x i32] %tmp1 = alloca i32* @@ -89,16 +89,16 @@ define void @stored_fi_to_self_offset() #0 { } ; GCN-LABEL: {{^}}stored_fi_to_fi: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} -; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} -; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} +; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} -; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}} -; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -define void @stored_fi_to_fi() #0 { +; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} +; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +define amdgpu_kernel void @stored_fi_to_fi() #0 { %tmp0 = alloca 
i32* %tmp1 = alloca i32* %tmp2 = alloca i32* @@ -115,10 +115,10 @@ define void @stored_fi_to_fi() #0 { } ; GCN-LABEL: {{^}}stored_fi_to_global: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { %tmp = alloca float store float 0.0, float *%tmp store float* %tmp, float* addrspace(1)* %ptr @@ -127,16 +127,16 @@ define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} -; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} +; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}} +; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float %tmp2 = alloca float @@ -150,10 +150,10 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset: ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; FIXME: Re-initialize -; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}} ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]] @@ -163,7 +163,7 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { %tmp0 = alloca [4096 x i32] %tmp1 = alloca [4096 x i32] %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0 @@ -184,9 +184,9 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4 -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 { +define amdgpu_kernel void 
@cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 { entry: %b = alloca i32, align 4 %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4 @@ -196,8 +196,8 @@ entry: ret void } -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind } diff --git a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 0d919bbf85e3..697f26b83a4d 100644 --- a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -2,12 +2,12 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s ; GCN-LABEL: {{^}}test_loop: -; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: +; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}} ; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] ; GCN: s_endpgm -define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { entry: %cmp = icmp eq i32 %n, -1 br i1 %cmp, label %for.exit, label %for.body @@ -31,7 +31,7 @@ for.body: ; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] -define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -52,7 +52,7 @@ for.body: ; GCN-LABEL: {{^}}loop_const_false: ; GCN-NOT: s_branch ; GCN: s_endpgm -define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -74,7 +74,7 @@ for.body: ; GCN-LABEL: {{^}}loop_const_undef: ; GCN-NOT: s_branch ; GCN: s_endpgm -define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -104,7 +104,7 @@ for.body: ; GCN: s_cbranch_vccnz [[LOOPBB]] ; GCN-NEXT: ; BB#2 ; GCN-NEXT: s_endpgm -define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind { +define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind { entry: br label %for.body diff --git a/test/CodeGen/AMDGPU/cf-stack-bug.ll b/test/CodeGen/AMDGPU/cf-stack-bug.ll index 75b87e486226..53fe89730f3a 100644 --- a/test/CodeGen/AMDGPU/cf-stack-bug.ll +++ b/test/CodeGen/AMDGPU/cf-stack-bug.ll @@ -35,7 +35,7 @@ ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested3: -define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -68,7 +68,7 @@ end: ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested4: -define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -109,7 +109,7 @@ end: ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested7: -define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) { entry: %0 
= icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -174,7 +174,7 @@ end: ; BUG32: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested8: -define void @nested8(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end diff --git a/test/CodeGen/AMDGPU/cf_end.ll b/test/CodeGen/AMDGPU/cf_end.ll index c74ee22868d5..3c990e0a4bd6 100644 --- a/test/CodeGen/AMDGPU/cf_end.ll +++ b/test/CodeGen/AMDGPU/cf_end.ll @@ -4,6 +4,6 @@ ; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] ; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] -define void @eop() { +define amdgpu_kernel void @eop() { ret void } diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 6db9a0761a01..cbdcf6aeaf42 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: ; GCN: flat_load_dword ; GCN: {{^}}BB0_2: -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 @@ -36,14 +36,14 @@ done: ; OPT-CI-NOT: getelementptr ; OPT: br i1 -; OPT-CI: ptrtoint -; OPT-CI: add -; OPT-CI: inttoptr +; OPT-CI: addrspacecast +; OPT-CI: getelementptr +; OPT-CI: bitcast ; OPT: br label ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32: ; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 @@ -69,14 +69,14 @@ done: ; OPT-CI-NOT: getelementptr ; OPT: br i1 -; OPT-CI: ptrtoint -; OPT-CI: add -; OPT-CI: inttoptr +; OPT-CI: addrspacecast +; OPT-CI: getelementptr +; OPT-CI: bitcast ; OPT: br label ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32: ; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index 2ed2857ff340..c1cf56e5058e 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -5,15 +5,17 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +target 
datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + ; OPT-LABEL: @test_sink_global_small_offset_i32( ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in ; OPT-VI: getelementptr i32, i32 addrspace(1)* %in ; OPT: br i1 -; OPT-CI: ptrtoint +; OPT-CI: getelementptr i8, ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: ; GCN: {{^}}BB0_2: -define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 @@ -43,7 +45,7 @@ done: ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} ; GCN: {{^}}BB1_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 @@ -70,7 +72,7 @@ done: ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} ; GCN: {{^}}BB2_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 @@ -97,7 +99,7 @@ done: ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} ; GCN: {{^}}BB3_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 @@ -122,14 +124,55 @@ done: ; OPT-LABEL: @test_sink_scratch_small_offset_i32( ; OPT-NOT: getelementptr [512 x i32] ; OPT: br i1 -; OPT: ptrtoint +; OPT: getelementptr i8, ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} ; GCN: {{^}}BB4_2: -define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1022 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = 
phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; This ends up not fitting due to the reserved 4 bytes at offset 0 +; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved( +; OPT-NOT: getelementptr [512 x i32] +; OPT: br i1 +; OPT: getelementptr i8, + +; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved: +; GCN: s_and_saveexec_b64 +; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4 +; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4 +; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: {{^BB[0-9]+}}_2: + +define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -165,8 +208,8 @@ done: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: {{^}}BB5_2: -define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +; GCN: {{^BB[0-9]+}}_2: +define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -197,8 +240,8 @@ done: ; GCN: s_and_saveexec_b64 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GCN: {{^}}BB6_2: -define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +; GCN: {{^BB[0-9]+}}_2: +define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { entry: %offset.ext = zext i32 %offset to i64 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 @@ -228,7 +271,7 @@ done: ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7 @@ -257,7 +300,7 @@ done: ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255 @@ -290,7 +333,7 @@ done: ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { 
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256 @@ -322,7 +365,7 @@ done: ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}} ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295 @@ -353,7 +396,7 @@ done: ; GCN: s_addc_u32 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181 @@ -383,7 +426,7 @@ done: ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143 @@ -421,7 +464,7 @@ done: ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144 @@ -445,13 +488,13 @@ done: %struct.foo = type { [3 x float], [3 x float] } ; OPT-LABEL: @sink_ds_address( -; OPT: ptrtoint %struct.foo addrspace(3)* %ptr to i64 +; OPT: getelementptr i8, ; GCN-LABEL: {{^}}sink_ds_address: ; GCN: s_load_dword [[SREG1:s[0-9]+]], ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] ; GCN-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5 -define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind { +define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind { entry: %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 @@ -476,9 +519,8 @@ bb34: ; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1( ; OPT: br i1 %tmp0, ; OPT: if: -; OPT: %sunkaddr = ptrtoint i8 addrspace(2)* %in to i64 -; OPT: %sunkaddr1 = add i64 %sunkaddr, 4095 -define void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +; OPT: getelementptr i8, {{.*}} 4095 +define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* 
%in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095 @@ -500,7 +542,141 @@ done: ret void } +; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32( +; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)* +; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28 +; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)* +; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst +define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +entry: + %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 + %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(3)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32( +; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)* +; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28 +; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)* +; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic +define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +entry: + %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 + %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic + %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0 + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(3)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32( +; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 +; OPT: br i1 +; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic +define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) { +entry: + %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999 + %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic + %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0 + br label %endif + +endif: + %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ] + store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32( +; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)* +; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28 +; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)* +; OPT: %tmp1 = call i32 
@llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false) +define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +entry: + %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 + %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false) + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(3)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32( +; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)* +; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28 +; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)* +; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false) +define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +entry: + %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 + %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false) + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(3)* %out.gep + br label %done + +done: + ret void +} + declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +attributes #2 = { nounwind argmemonly } diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index 066ef951cc31..53adf09026ec 100644 --- a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -36,7 +36,7 @@ ; GCN: BB0_3: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { entry: %shr = lshr i32 %arg1, 8 br i1 undef, label %bb0, label %bb1 @@ -76,7 +76,7 @@ ret: ; OPT: ret ; GCN-LABEL: {{^}}sink_sbfe_i32: -define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { +define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { entry: %shr = ashr i32 %arg1, 8 br i1 undef, label %bb0, label %bb1 @@ -120,20 +120,21 @@ ret: ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_bfe_u32 s0, s0, 0xc0004 +; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c +; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 ; GCN: s_cbranch_scc1 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: s_and_b32 s0, s0, 0xff +; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0xff ; GCN: BB2_2: ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: s_and_b32 s0, s0, 0x7f +; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0x7f ; GCN: BB2_3: ; GCN: buffer_store_short ; GCN: s_endpgm -define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 
%arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 { entry: %shr = lshr i16 %arg1, 4 br i1 undef, label %bb0, label %bb1 @@ -186,7 +187,7 @@ ret: ; GCN: BB3_3: ; GCN: buffer_store_dwordx2 -define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 30 br i1 undef, label %bb0, label %bb1 @@ -235,7 +236,7 @@ ret: ; GCN: BB4_3: ; GCN: buffer_store_dwordx2 -define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 15 br i1 undef, label %bb0, label %bb1 @@ -282,7 +283,7 @@ ret: ; GCN: BB5_3: ; GCN: buffer_store_dwordx2 -define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 35 br i1 undef, label %bb0, label %bb1 diff --git a/test/CodeGen/AMDGPU/clamp-modifier.ll b/test/CodeGen/AMDGPU/clamp-modifier.ll new file mode 100644 index 000000000000..c3a7d5e14d87 --- /dev/null +++ b/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -0,0 +1,222 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}v_clamp_add_src_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-NOT: [[A]] +; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}} +define amdgpu_kernel void @v_clamp_add_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd float %a, 1.0 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + store float %clamp, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_multi_use_src_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}} +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[ADD]], [[ADD]] clamp{{$}} +define amdgpu_kernel void @v_clamp_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd float %a, 1.0 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + store float %clamp, float addrspace(1)* %out.gep + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_clamp_dbg_use_src_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-NOT: [[A]] +; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}} +define amdgpu_kernel void @v_clamp_dbg_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* 
%gep0 + %add = fadd float %a, 1.0 + call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + store float %clamp, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_add_neg_src_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[A]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[FLOOR]], -[[FLOOR]] clamp{{$}} +define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %floor = call float @llvm.floor.f32(float %a) + %neg.floor = fsub float -0.0, %floor + %max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + store float %clamp, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_non_clamp_max_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}} +; GCN: v_max_f32_e32 v{{[0-9]+}}, 0, [[ADD]]{{$}} +define amdgpu_kernel void @v_non_clamp_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd float %a, 1.0 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + store float %max, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_add_src_f32_denormals: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}} +define amdgpu_kernel void @v_clamp_add_src_f32_denormals(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd float %a, 1.0 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + store float %clamp, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_add_src_f16_denorm: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}} + +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}} +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]] +define amdgpu_kernel void @v_clamp_add_src_f16_denorm(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %add = fadd half %a, 1.0 + %max = call half @llvm.maxnum.f16(half %add, half 0.0) + %clamp = call half @llvm.minnum.f16(half %max, half 1.0) + store half %clamp, half addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_add_src_f16_no_denormals: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; VI-NOT: [[A]] +; VI: v_add_f16_e64 
v{{[0-9]+}}, [[A]], 1.0 clamp{{$}} + +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}} +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]] +define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(half addrspace(1)* %out, half addrspace(1)* %aptr) #3 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %add = fadd half %a, 1.0 + %max = call half @llvm.maxnum.f16(half %add, half 0.0) + %clamp = call half @llvm.minnum.f16(half %max, half 1.0) + store half %clamp, half addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_add_src_v2f32: +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}} +; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[A]], 1.0 clamp{{$}} +; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[B]], 1.0 clamp{{$}} +define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid + %a = load <2 x float>, <2 x float> addrspace(1)* %gep0 + %add = fadd <2 x float> %a, + %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer) + %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> ) + store <2 x float> %clamp, <2 x float> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_add_src_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], 1.0 clamp{{$}} +define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %add = fadd double %a, 1.0 + %max = call double @llvm.maxnum.f64(double %add, double 0.0) + %clamp = call double @llvm.minnum.f64(double %max, double 1.0) + store double %clamp, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_mac_to_mad: +; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]}} clamp{{$}} +define amdgpu_kernel void @v_clamp_mac_to_mad(float addrspace(1)* %out, float addrspace(1)* %aptr, float %a) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %b = load float, float addrspace(1)* %gep0 + + %mul = fmul float %a, %a + %add = fadd float %mul, %b + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + %res = fadd float %clamp, %b + store float %res, float addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.floor.f32(float) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 +declare double @llvm.fabs.f64(double) #1 +declare double @llvm.minnum.f64(double, double) #1 +declare double @llvm.maxnum.f64(double, double) #1 
+declare half @llvm.fabs.f16(half) #1 +declare half @llvm.minnum.f16(half, half) #1 +declare half @llvm.maxnum.f16(half, half) #1 +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1 +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1 +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "target-features"="+fp32-denormals" } +attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1) +!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8} +!8 = !DIBasicType(name: "float", size: 32, align: 32) +!9 = !DIExpression() +!10 = !DILocation(line: 1, column: 42, scope: !5) diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir new file mode 100644 index 000000000000..fbfd0fbf9308 --- /dev/null +++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir @@ -0,0 +1,424 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s +--- | + define amdgpu_ps void @v_max_self_clamp_not_set_f32() #0 { + ret void + } + + define amdgpu_ps void @v_clamp_omod_already_set_f32() #0 { + ret void + } + + define amdgpu_ps void @v_omod_mul_omod_already_set_f32() #0 { + ret void + } + + define amdgpu_ps void @v_omod_mul_clamp_already_set_f32() #0 { + ret void + } + + define amdgpu_ps void @v_omod_add_omod_already_set_f32() #0 { + ret void + } + + define amdgpu_ps void @v_omod_add_clamp_already_set_f32() #0 { + ret void + } + + attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } + +... 
+--- +# GCN-LABEL: name: v_max_self_clamp_not_set_f32 +# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec +# GCN-NEXT: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec + +name: v_max_self_clamp_not_set_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vreg_64 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %25 = REG_SEQUENCE %3, 1, %24, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %14 = S_MOV_B32 2 + %26 = V_LSHL_B64 killed %25, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %18 = COPY %26 + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec + %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec + %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
+--- +# GCN-LABEL: name: v_clamp_omod_already_set_f32 +# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec +# GCN: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec +name: v_clamp_omod_already_set_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vreg_64 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %25 = REG_SEQUENCE %3, 1, %24, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %14 = S_MOV_B32 2 + %26 = V_LSHL_B64 killed %25, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %18 = COPY %26 + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec + %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec + %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM +... 
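(The four negative tests below exercise the SIFoldOperands omod fold. On these targets a VOP3 instruction's output-modifier field can scale the result by 2, 4, or 0.5, so a trailing multiply by 0.5 (the 1056964608 immediate in the tests below is the bit pattern of 0.5f) or an add of a value to itself can normally be absorbed into the preceding instruction as an omod; the tests check that this is not done when that trailing op already carries its own omod or clamp bit. The sketch below is illustrative only and not part of the imported tests; the function name and calling convention are arbitrary, and whether the fold actually fires also depends on the usual FP-mode checks.)

define amdgpu_ps float @omod_fold_candidate(float %x) {
  %add = fadd float %x, 1.0        ; selected to a v_add_f32
  %mul = fmul float %add, 0.5      ; candidate to become "omod div:2" on the add
  ret float %mul
}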
+--- +# Don't fold a mul that looks like an omod if itself has omod set + +# GCN-LABEL: name: v_omod_mul_omod_already_set_f32 +# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec +# GCN-NEXT: %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit %exec +name: v_omod_mul_omod_already_set_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vreg_64 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %25 = REG_SEQUENCE %3, 1, %24, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %14 = S_MOV_B32 2 + %26 = V_LSHL_B64 killed %25, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %18 = COPY %26 + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec + %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec + %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit %exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... +--- +# Don't fold a mul that looks like an omod if itself has clamp set +# This might be OK, but would require folding the clamp at the same time. 
+# GCN-LABEL: name: v_omod_mul_clamp_already_set_f32 +# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec +# GCN-NEXT: %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit %exec + +name: v_omod_mul_clamp_already_set_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vreg_64 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %25 = REG_SEQUENCE %3, 1, %24, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %14 = S_MOV_B32 2 + %26 = V_LSHL_B64 killed %25, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %18 = COPY %26 + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec + %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec + %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit %exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+# Don't fold an add that looks like an omod if itself has omod set
+
+# GCN-LABEL: name: v_omod_add_omod_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit %exec
+name: v_omod_add_omod_already_set_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vgpr_32 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vgpr_32 }
+  - { id: 23, class: vreg_64 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %25 = REG_SEQUENCE %3, 1, %24, 2
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+    %13 = REG_SEQUENCE killed %5, 17, %12, 18
+    %14 = S_MOV_B32 2
+    %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+    %16 = REG_SEQUENCE killed %4, 17, %12, 18
+    %18 = COPY %26
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+    %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit %exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+---
+# Don't fold an add that looks like an omod if itself has clamp set
+# This might be OK, but would require folding the clamp at the same time.
+# GCN-LABEL: name: v_omod_add_clamp_already_set_f32 +# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec +# GCN-NEXT: %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit %exec + +name: v_omod_add_clamp_already_set_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vreg_64 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %25 = REG_SEQUENCE %3, 1, %24, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %14 = S_MOV_B32 2 + %26 = V_LSHL_B64 killed %25, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %18 = COPY %26 + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec + %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec + %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit %exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
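(The clamp.ll test added next checks how the max-with-0.0 / min-with-1.0 idiom is lowered: into the clamp modifier on a self-max, into v_med3_f32, or into a separate max/min pair, depending on the dx10-clamp target feature and on what can be assumed about NaNs. For reference, a minimal standalone form of that idiom is sketched here; it is illustrative only and not part of the patch, and the kernel name and the omission of the per-workitem addressing used by the real tests are simplifications.)

declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)

define amdgpu_kernel void @clamp01(float addrspace(1)* %out, float %x) {
  %lo = call float @llvm.maxnum.f32(float %x, float 0.0)    ; clamp below at 0.0
  %hi = call float @llvm.minnum.f32(float %lo, float 1.0)   ; clamp above at 1.0
  store float %hi, float addrspace(1)* %out                 ; expected: v_max_f32_e64 ... clamp
  ret void
}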
diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll new file mode 100644 index 000000000000..9735c7074be2 --- /dev/null +++ b/test/CodeGen/AMDGPU/clamp.ll @@ -0,0 +1,529 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}v_clamp_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_neg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %fneg.a = fsub float -0.0, %a + %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_negabs_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}} +define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %fabs.a = call float @llvm.fabs.f32(float %a) + %fneg.fabs.a = fsub float -0.0, %fabs.a + + %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_negzero_f32: +; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1 +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0 +define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float -0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] +define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* 
%aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + store volatile float %max, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_clamp_f16: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} + +; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}} +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] +define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %max = call half @llvm.maxnum.f16(half %a, half 0.0) + %med = call half @llvm.minnum.f16(half %max, half 1.0) + + store half %med, half addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_neg_f16: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}} + +; FIXME: Better to fold neg into max +; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}} +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] +define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %fneg.a = fsub half -0.0, %a + %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0) + %med = call half @llvm.minnum.f16(half %max, half 1.0) + + store half %med, half addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_negabs_f16: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; VI: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}} + +; FIXME: Better to fold neg/abs into max + +; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}} +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] +define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %fabs.a = call half @llvm.fabs.f16(half %a) + %fneg.fabs.a = fsub half -0.0, %fabs.a + + %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0) + %med = call half @llvm.minnum.f16(half %max, half 1.0) + + store half %med, half addrspace(1)* %out.gep + ret void +} + +; FIXME: Do f64 instructions support clamp? 
+; GCN-LABEL: {{^}}v_clamp_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %max = call double @llvm.maxnum.f64(double %a, double 0.0) + %med = call double @llvm.minnum.f64(double %max, double 1.0) + + store double %med, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_neg_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %fneg.a = fsub double -0.0, %a + %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0) + %med = call double @llvm.minnum.f64(double %max, double 1.0) + + store double %med, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_negabs_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}} +define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %fabs.a = call double @llvm.fabs.f64(double %a) + %fneg.fabs.a = fsub double -0.0, %fabs.a + + %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0) + %med = call double @llvm.minnum.f64(double %max, double 1.0) + + store double %med, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 +define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_aby_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_bay_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; 
GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_yab_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_yba_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_bya_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0 +define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} 
+define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5 +define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}} +define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float)) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_snan_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float)) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; --------------------------------------------------------------------- +; Test non-default behaviors enabling snans and disabling dx10_clamp +; --------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + 
%max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] +define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd nnan float %a, 1.0 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0) + store float %med, float 
addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0 +define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0 +define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0 +define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000 +define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001 +define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float)) + store float %med, float addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 +declare double @llvm.fabs.f64(double) #1 +declare double @llvm.minnum.f64(double, double) #1 +declare double @llvm.maxnum.f64(double, double) #1 +declare half @llvm.fabs.f16(half) #1 +declare half @llvm.minnum.f16(half, half) #1 +declare half @llvm.maxnum.f16(half, half) #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind 
"target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" } +attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" } +attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" } diff --git a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll index 9b4b61cf728a..208d97feb642 100644 --- a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -8,7 +8,7 @@ declare i1 @llvm.amdgcn.class.f32(float, i32) ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}} -define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) { +define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) { bb0: %tmp = icmp sgt i32 %arg1, 4 %c = icmp eq i32 %arg3, 0 @@ -35,7 +35,7 @@ bb2: ; GCN-NOT: vcc ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}} -define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { +define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { bb0: %tmp = icmp sgt i32 %arg1, 4 %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef) diff --git a/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll b/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll index 7ff133b86e72..ef1b3d25f883 100644 --- a/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll +++ b/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll @@ -1,5 +1,4 @@ -; RUN: llc -march=amdgcn < %s | FileCheck %s -; REQUIRES: asserts +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s ; ; This testcase used to cause the following crash: ; @@ -18,14 +17,16 @@ ; ; Test for a valid output: ; CHECK: image_sample_c_d_o - -target triple = "amdgcn--" - define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 { main_body: - %tmp = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg6, <2 x i32> %arg8) - %tmp23 = fadd float %tmp, 0xBFA99999A0000000 - %tmp24 = fadd float %tmp, 0x3FA99999A0000000 + %i.i = extractelement <2 x i32> %arg8, i32 0 + %j.i = extractelement <2 x i32> %arg8, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 0, i32 %arg6) #1 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 0, i32 %arg6) #1 + %tmp23 = fadd float %p2.i, 0xBFA99999A0000000 + %tmp24 = fadd float %p2.i, 0x3FA99999A0000000 %tmp25 = bitcast float %tmp23 to i32 %tmp26 = 
insertelement <16 x i32> , i32 %tmp25, i32 1 %tmp27 = insertelement <16 x i32> %tmp26, i32 undef, i32 2 @@ -35,7 +36,8 @@ main_body: %tmp31 = insertelement <16 x i32> %tmp30, i32 undef, i32 6 %tmp32 = insertelement <16 x i32> %tmp31, i32 undef, i32 7 %tmp33 = insertelement <16 x i32> %tmp32, i32 undef, i32 8 - %tmp34 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp33, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %tmp33.bc = bitcast <16 x i32> %tmp33 to <16 x float> + %tmp34 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp33.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true) %tmp35 = extractelement <4 x float> %tmp34, i32 0 %tmp36 = bitcast float %tmp24 to i32 %tmp37 = insertelement <16 x i32> , i32 %tmp36, i32 1 @@ -46,7 +48,8 @@ main_body: %tmp42 = insertelement <16 x i32> %tmp41, i32 undef, i32 6 %tmp43 = insertelement <16 x i32> %tmp42, i32 undef, i32 7 %tmp44 = insertelement <16 x i32> %tmp43, i32 undef, i32 8 - %tmp45 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp44, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %tmp44.bc = bitcast <16 x i32> %tmp44 to <16 x float> + %tmp45 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp44.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true) %tmp46 = extractelement <4 x float> %tmp45, i32 0 %tmp47 = fmul float %tmp35, %tmp46 %tmp48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp47, 14 @@ -54,9 +57,10 @@ main_body: ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp49 } -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" } +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll index 4c7875c3a039..3e1b76a1df09 100644 --- a/test/CodeGen/AMDGPU/coalescer_remat.ll +++ b/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -13,7 +13,7 @@ declare float @llvm.fma.f32(float, float, float) ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: ; CHECK: ; NumVgprs: 8 -define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { +define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 br i1 %cmpflag, label %loop, label %exit diff --git a/test/CodeGen/AMDGPU/code-object-metadata-deduce-ro-arg.ll b/test/CodeGen/AMDGPU/code-object-metadata-deduce-ro-arg.ll new file mode 100644 index 
000000000000..a33c3646e253 --- /dev/null +++ b/test/CodeGen/AMDGPU/code-object-metadata-deduce-ro-arg.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck %s + +; CHECK: - Name: test_ro_arg +; CHECK: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: F32 +; CHECK-NEXT: AccQual: ReadOnly +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: IsConst: true +; CHECK-NEXT: IsRestrict: true +; CHECK-NEXT: TypeName: 'float*' + +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: F32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: 'float*' + +define amdgpu_kernel void @test_ro_arg(float addrspace(1)* noalias readonly %in, float addrspace(1)* %out) + !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2 + !kernel_arg_base_type !2 !kernel_arg_type_qual !3 { + ret void +} + +!0 = !{i32 1, i32 1} +!1 = !{!"none", !"none"} +!2 = !{!"float*", !"float*"} +!3 = !{!"const restrict", !""} + diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll new file mode 100644 index 000000000000..88ba310a92ca --- /dev/null +++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll @@ -0,0 +1,1260 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +%struct.A = type { i8, float } +%opencl.image1d_t = type opaque +%opencl.image2d_t = type opaque +%opencl.image3d_t = type opaque +%opencl.queue_t = type opaque +%opencl.pipe_t = type opaque +%struct.B = type { i32 addrspace(1)*} +%opencl.clk_event_t = type opaque + +; CHECK: --- +; CHECK: Version: [ 1, 0 ] +; CHECK: Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ] +; CHECK: Kernels: + +; CHECK: - Name: test_char +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: 
ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_char(i8 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_ushort2 +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: U16 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: ushort2 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_ushort2(<2 x i16> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10 + !kernel_arg_base_type !10 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_int3 +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 16 +; CHECK-NEXT: Align: 16 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int3 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_int3(<3 x i32> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11 + !kernel_arg_base_type !11 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_ulong4 +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 32 +; CHECK-NEXT: Align: 32 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: U64 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: ulong4 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_ulong4(<4 x i64> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12 + 
!kernel_arg_base_type !12 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_half8 +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 16 +; CHECK-NEXT: Align: 16 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: F16 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: half8 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_half8(<8 x half> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13 + !kernel_arg_base_type !13 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_float16 +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 64 +; CHECK-NEXT: Align: 64 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: F32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: float16 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_float16(<16 x float> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14 + !kernel_arg_base_type !14 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_double16 +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 128 +; CHECK-NEXT: Align: 128 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: F64 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: double16 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_double16(<16 x double> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15 + !kernel_arg_base_type !15 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_pointer +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: 'int *' 
+; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16 + !kernel_arg_base_type !16 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_image +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: Image +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: image2d_t +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17 + !kernel_arg_base_type !17 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_sampler +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: Sampler +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: sampler_t +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_sampler(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18 + !kernel_arg_base_type !18 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_queue +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: Queue +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: queue_t +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: 
HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19 + !kernel_arg_base_type !19 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_struct +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Private +; CHECK-NEXT: TypeName: struct A +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_struct(%struct.A* byval %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_i128 +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 16 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: i128 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_i128(i128 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21 + !kernel_arg_base_type !21 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_multi_arg +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I16 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: short2 +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: char3 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: 
HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c) + !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24 + !kernel_arg_base_type !24 !kernel_arg_type_qual !25 { + ret void +} + +; CHECK: - Name: test_addr_space +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Constant +; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: DynamicSharedPointer +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: PointeeAlign: 4 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Local +; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g, + i32 addrspace(2)* %c, + i32 addrspace(3)* %l) + !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51 + !kernel_arg_base_type !51 !kernel_arg_type_qual !25 { + ret void +} + +; CHECK: - Name: test_type_qual +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: IsVolatile: true +; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: IsConst: true +; CHECK-NEXT: IsRestrict: true +; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: Pipe +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: IsPipe: true +; CHECK-NEXT: TypeName: 'int *' +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_type_qual(i32 
addrspace(1)* %a, + i32 addrspace(1)* %b, + %opencl.pipe_t addrspace(1)* %c) + !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51 + !kernel_arg_base_type !51 !kernel_arg_type_qual !70 { + ret void +} + +; CHECK: - Name: test_access_qual +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: Image +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: ReadOnly +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: image1d_t +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: Image +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: WriteOnly +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: image2d_t +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: Image +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: ReadWrite +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: image3d_t +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro, + %opencl.image2d_t addrspace(1)* %wo, + %opencl.image3d_t addrspace(1)* %rw) + !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62 + !kernel_arg_base_type !62 !kernel_arg_type_qual !25 { + ret void +} + +; CHECK: - Name: test_vec_type_hint_half +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: VecTypeHint: half +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_vec_type_hint_half(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 { + ret void +} + +; CHECK: - Name: test_vec_type_hint_float +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: VecTypeHint: float +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: 
ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_vec_type_hint_float(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 { + ret void +} + +; CHECK: - Name: test_vec_type_hint_double +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: VecTypeHint: double +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_vec_type_hint_double(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 { + ret void +} + +; CHECK: - Name: test_vec_type_hint_char +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: VecTypeHint: char +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_vec_type_hint_char(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 { + ret void +} + +; CHECK: - Name: test_vec_type_hint_short +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: VecTypeHint: short +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; 
CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_vec_type_hint_short(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 { + ret void +} + +; CHECK: - Name: test_vec_type_hint_long +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: VecTypeHint: long +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_vec_type_hint_long(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 { + ret void +} + +; CHECK: - Name: test_vec_type_hint_unknown +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: VecTypeHint: unknown +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 { + ret void +} + +; CHECK: - Name: test_reqd_wgs_vec_type_hint +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: ReqdWorkGroupSize: [ 1, 2, 4 ] +; CHECK-NEXT: VecTypeHint: int +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: 
HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5 + !reqd_work_group_size !6 { + ret void +} + +; CHECK: - Name: test_wgs_hint_vec_type_hint +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: WorkGroupSizeHint: [ 8, 16, 32 ] +; CHECK-NEXT: VecTypeHint: uint4 +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: int +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7 + !work_group_size_hint !8 { + ret void +} + +; CHECK: - Name: test_arg_ptr_to_ptr +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: 'int **' +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_arg_ptr_to_ptr(i32* addrspace(1)* %a) + !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80 + !kernel_arg_base_type !80 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_arg_struct_contains_ptr +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Private +; CHECK-NEXT: TypeName: struct B +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: 
Global +define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B* byval %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82 + !kernel_arg_base_type !82 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_arg_vector_of_ptr +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 16 +; CHECK-NEXT: Align: 16 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I32 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: 'global int* __attribute__((ext_vector_type(2)))' +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83 + !kernel_arg_base_type !83 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_arg_unknown_builtin_type +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: clk_event_t +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_arg_unknown_builtin_type( + %opencl.clk_event_t addrspace(1)* %a) + !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84 + !kernel_arg_base_type !84 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_pointee_align +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: GlobalBuffer +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: TypeName: 'long *' +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: DynamicSharedPointer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: PointeeAlign: 1 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Local +; CHECK-NEXT: TypeName: 'char *' +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: DynamicSharedPointer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: PointeeAlign: 2 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Local +; CHECK-NEXT: TypeName: 'char2 *' +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: DynamicSharedPointer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: PointeeAlign: 4 +; CHECK-NEXT: 
AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Local +; CHECK-NEXT: TypeName: 'char3 *' +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: DynamicSharedPointer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: PointeeAlign: 4 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Local +; CHECK-NEXT: TypeName: 'char4 *' +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: DynamicSharedPointer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: PointeeAlign: 8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Local +; CHECK-NEXT: TypeName: 'char8 *' +; CHECK-NEXT: - Size: 4 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: DynamicSharedPointer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: PointeeAlign: 16 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: AddrSpaceQual: Local +; CHECK-NEXT: TypeName: 'char16 *' +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, + i8 addrspace(3)* %b, + <2 x i8> addrspace(3)* %c, + <3 x i8> addrspace(3)* %d, + <4 x i8> addrspace(3)* %e, + <8 x i8> addrspace(3)* %f, + <16 x i8> addrspace(3)* %g) + !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93 + !kernel_arg_base_type !93 !kernel_arg_type_qual !94 { + ret void +} + +!llvm.printf.fmts = !{!100, !101} + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"int"} +!4 = !{!""} +!5 = !{i32 undef, i32 1} +!6 = !{i32 1, i32 2, i32 4} +!7 = !{<4 x i32> undef, i32 0} +!8 = !{i32 8, i32 16, i32 32} +!9 = !{!"char"} +!10 = !{!"ushort2"} +!11 = !{!"int3"} +!12 = !{!"ulong4"} +!13 = !{!"half8"} +!14 = !{!"float16"} +!15 = !{!"double16"} +!16 = !{!"int *"} +!17 = !{!"image2d_t"} +!18 = !{!"sampler_t"} +!19 = !{!"queue_t"} +!20 = !{!"struct A"} +!21 = !{!"i128"} +!22 = !{i32 0, i32 0, i32 0} +!23 = !{!"none", !"none", !"none"} +!24 = !{!"int", !"short2", !"char3"} +!25 = !{!"", !"", !""} +!26 = !{half undef, i32 1} +!27 = !{float undef, i32 1} +!28 = !{double undef, i32 1} +!29 = !{i8 undef, i32 1} +!30 = !{i16 undef, i32 1} +!31 = !{i64 undef, i32 1} +!32 = !{i32 *undef, i32 1} +!50 = !{i32 1, i32 2, i32 3} +!51 = !{!"int *", !"int *", !"int *"} +!60 = !{i32 1, i32 1, i32 1} +!61 = !{!"read_only", !"write_only", !"read_write"} +!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"} +!70 = !{!"volatile", !"const restrict", !"pipe"} +!80 = !{!"int **"} +!81 = !{i32 1} +!82 = !{!"struct B"} +!83 = !{!"global int* __attribute__((ext_vector_type(2)))"} +!84 = !{!"clk_event_t"} +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} +!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3} +!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!93 = !{!"long *", !"char *", !"char2 *", !"char3 *", !"char4 *", !"char8 *", !"char16 *"} +!94 = !{!"", !"", !"", !"", !"", !"", !""} +!100 = !{!"1:1:4:%d\5Cn"} +!101 = !{!"2:1:8:%g\5Cn"} + +; NOTES: Displaying notes found at file offset 0x{{[0-9]+}} +; NOTES-NEXT: Owner Data size Description +; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) +; NOTES-NEXT: AMD 
0x0000001b Unknown note type: (0x00000003)
+; GFX700: AMD 0x00009171 Unknown note type: (0x0000000a)
+; GFX800: AMD 0x00009190 Unknown note type: (0x0000000a)
+; GFX900: AMD 0x00009171 Unknown note type: (0x0000000a)
+
+; PARSER: AMDGPU Code Object Metadata Parser Test: PASS
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll
new file mode 100644
index 000000000000..f41da9f92136
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll
new file mode 100644
index 000000000000..0509663d9849
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll
new file mode 100644
index 000000000000..7404cec5d78a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{i32 1}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll
new file mode 100644
index 000000000000..3b232e40cf25
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+
+; CHECK: Kernels:
+; CHECK: - Name: test
+; CHECK: CodeProps:
+; CHECK: KernargSegmentSize: 24
+; GFX700: WavefrontNumSGPRs: 6
+; GFX800: WavefrontNumSGPRs: 96
+; GFX900: WavefrontNumSGPRs: 6
+; GFX700: WorkitemNumVGPRs: 4
+; GFX800: WorkitemNumVGPRs: 6
+; GFX900: WorkitemNumVGPRs: 6
+; CHECK: KernargSegmentAlign: 4
+; CHECK: GroupSegmentAlign: 4
+; CHECK: PrivateSegmentAlign: 4
+; CHECK: WavefrontSize: 6
+define amdgpu_kernel void @test(
+ half addrspace(1)* %r,
+ half addrspace(1)* %a,
+ half addrspace(1)* %b) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %b.val = load half, half addrspace(1)* %b
+ %r.val = fadd half %a.val, %b.val
+ store half %r.val, half addrspace(1)* %r
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
new file mode 100644
index 000000000000..801029be8cb9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+
+; CHECK: Kernels:
+; CHECK: - Name: test
+; CHECK: DebugProps:
+; CHECK: DebuggerABIVersion: [ 1, 0 ]
+; CHECK: ReservedNumVGPRs: 4
+; CHECK: ReservedFirstVGPR: 11
+; CHECK: PrivateSegmentBufferSGPR: 0
+; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !16, metadata !17), !dbg !18
+ %0 = load i32
addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !19 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 0, !dbg !19 + store i32 777, i32 addrspace(1)* %arrayidx, align 4, !dbg !20 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !21 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 1, !dbg !21 + store i32 888, i32 addrspace(1)* %arrayidx1, align 4, !dbg !22 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !23 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 2, !dbg !23 + store i32 999, i32 addrspace(1)* %arrayidx2, align 4, !dbg !24 + ret void, !dbg !25 +} + +attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.dbg.cu = !{!0} +!opencl.ocl.version = !{!3} +!llvm.module.flags = !{!4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "code-object-metadata-kernel-debug-props.cl", directory: "/some/random/directory") +!2 = !{} +!3 = !{i32 1, i32 0} +!4 = !{i32 2, !"Dwarf Version", i32 2} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{!"clang version 5.0.0"} +!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{i32 1} +!13 = !{!"none"} +!14 = !{!"int*"} +!15 = !{!""} +!16 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!17 = !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef) +!18 = !DILocation(line: 1, column: 30, scope: !7) +!19 = !DILocation(line: 2, column: 3, scope: !7) +!20 = !DILocation(line: 2, column: 8, scope: !7) +!21 = !DILocation(line: 3, column: 3, scope: !7) +!22 = !DILocation(line: 3, column: 8, scope: !7) +!23 = !DILocation(line: 4, column: 3, scope: !7) +!24 = !DILocation(line: 4, column: 8, scope: !7) +!25 = !DILocation(line: 5, column: 1, scope: !7) diff --git a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll index 585172092676..155de5353bcb 100644 --- a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll +++ b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll @@ -8,7 +8,7 @@ ; SI-LLC-LABEL: {{^}}test: ; SI-LLC: s_mul_i32 ; SI-LLC-NOT: mul -define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { +define amdgpu_kernel void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { entry: %0 = mul nsw i32 %a, 3 %1 = sext i32 %0 to i64 diff --git a/test/CodeGen/AMDGPU/combine_vloads.ll b/test/CodeGen/AMDGPU/combine_vloads.ll index 
01572afa6205..f8d4e01085c2 100644 --- a/test/CodeGen/AMDGPU/combine_vloads.ll +++ b/test/CodeGen/AMDGPU/combine_vloads.ll @@ -12,7 +12,7 @@ ; EG-LABEL: {{^}}combine_vloads: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind { +define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind { entry: br label %for.body diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll index a4c51b233f41..973c4544d97a 100644 --- a/test/CodeGen/AMDGPU/commute-compares.ll +++ b/test/CodeGen/AMDGPU/commute-compares.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}commute_eq_64_i32: ; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -21,7 +21,7 @@ define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_ne_64_i32: ; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -36,7 +36,7 @@ define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_ne_litk_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039 ; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -49,7 +49,7 @@ define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_ugt_64_i32: ; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -62,7 +62,7 @@ define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_uge_64_i32: ; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} -define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -75,7 +75,7 @@ define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ult_64_i32: ; 
GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -88,7 +88,7 @@ define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ule_63_i32: ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -104,7 +104,7 @@ define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ule_64_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}} ; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -117,7 +117,7 @@ define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_sgt_neg1_i32: ; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}} -define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -130,7 +130,7 @@ define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_sge_neg2_i32: ; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} -define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -143,7 +143,7 @@ define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_slt_neg16_i32: ; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} -define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -156,7 +156,7 @@ define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; GCN-LABEL: {{^}}commute_sle_5_i32: ; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} -define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 
@llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -173,7 +173,7 @@ define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_eq_64_i64: ; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -186,7 +186,7 @@ define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_ne_64_i64: ; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -199,7 +199,7 @@ define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_ugt_64_i64: ; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -212,7 +212,7 @@ define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_uge_64_i64: ; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -225,7 +225,7 @@ define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ult_64_i64: ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -238,7 +238,7 @@ define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ule_63_i64: ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -254,7 +254,7 @@ define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) 
# ; GCN-LABEL: {{^}}commute_ule_64_i64: ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} ; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -267,7 +267,7 @@ define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_sgt_neg1_i64: ; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -280,7 +280,7 @@ define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_sge_neg2_i64: ; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -293,7 +293,7 @@ define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_slt_neg16_i64: ; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -306,7 +306,7 @@ define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-LABEL: {{^}}commute_sle_5_i64: ; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -324,7 +324,7 @@ define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_oeq_2.0_f32: ; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -338,7 +338,7 @@ define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ogt_2.0_f32: ; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float 
addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -351,7 +351,7 @@ define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_oge_2.0_f32: ; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -364,7 +364,7 @@ define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_olt_2.0_f32: ; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -377,7 +377,7 @@ define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ole_2.0_f32: ; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -390,7 +390,7 @@ define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_one_2.0_f32: ; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -403,7 +403,7 @@ define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ord_2.0_f32: ; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -416,7 +416,7 @@ define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ueq_2.0_f32: ; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 
%tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -429,7 +429,7 @@ define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ugt_2.0_f32: ; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -442,7 +442,7 @@ define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_uge_2.0_f32: ; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -455,7 +455,7 @@ define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ult_2.0_f32: ; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -468,7 +468,7 @@ define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ule_2.0_f32: ; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -481,7 +481,7 @@ define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_une_2.0_f32: ; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -494,7 +494,7 @@ define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_uno_2.0_f32: ; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -512,7 +512,7 @@ define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_oeq_2.0_f64: ; GCN: 
v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -526,7 +526,7 @@ define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ogt_2.0_f64: ; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -539,7 +539,7 @@ define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_oge_2.0_f64: ; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -552,7 +552,7 @@ define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_olt_2.0_f64: ; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -565,7 +565,7 @@ define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ole_2.0_f64: ; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -578,7 +578,7 @@ define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_one_2.0_f64: ; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -591,7 +591,7 @@ define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ord_2.0_f64: ; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { 
+define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -604,7 +604,7 @@ define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ueq_2.0_f64: ; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -617,7 +617,7 @@ define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ugt_2.0_f64: ; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -630,7 +630,7 @@ define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_uge_2.0_f64: ; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -643,7 +643,7 @@ define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ult_2.0_f64: ; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -656,7 +656,7 @@ define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ule_2.0_f64: ; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -669,7 +669,7 @@ define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_une_2.0_f64: ; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 
@llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -682,7 +682,7 @@ define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_uno_2.0_f64: ; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -701,9 +701,9 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_frameindex: ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}} -define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 { entry: %stack0 = alloca i32 %ptr0 = load volatile i32*, i32* addrspace(1)* undef diff --git a/test/CodeGen/AMDGPU/commute-shifts.ll b/test/CodeGen/AMDGPU/commute-shifts.ll index 862f236514ca..84d8bf2bd706 100644 --- a/test/CodeGen/AMDGPU/commute-shifts.ll +++ b/test/CodeGen/AMDGPU/commute-shifts.ll @@ -4,10 +4,10 @@ ; GCN-LABEL: {{^}}main: ; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1 -define amdgpu_ps void @main(float %arg0, float %arg1) #0 { +define amdgpu_ps float @main(float %arg0, float %arg1) #0 { bb: %tmp = fptosi float %arg0 to i32 - %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp1 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false) %tmp2.f = extractelement <4 x float> %tmp1, i32 0 %tmp2 = bitcast float %tmp2.f to i32 %tmp3 = and i32 %tmp, 7 @@ -15,15 +15,14 @@ bb: %tmp5 = and i32 %tmp2, %tmp4 %tmp6 = icmp eq i32 %tmp5, 0 %tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1 - %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7) - %tmp9 = bitcast i32 %tmp8 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9) - ret void + %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7) + %tmp9 = bitcast <2 x half> %tmp8 to float + ret float %tmp9 } -declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare i32 @llvm.SI.packf16(float, float) #1 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll index ed4ec82eb3e3..8820e4fd80e5 100644 --- a/test/CodeGen/AMDGPU/commute_modifiers.ll +++ b/test/CodeGen/AMDGPU/commute_modifiers.ll @@ -8,7 +8,7 @@ declare float @llvm.fma.f32(float, float, float) nounwind 
readnone ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0 ; SI: buffer_store_dword [[REG]] -define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -22,7 +22,7 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0 ; SI: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -37,7 +37,7 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]] ; SI: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -53,7 +53,7 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace( ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]| ; SI: buffer_store_dword [[REG]] -define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -68,7 +68,7 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace( ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -85,7 +85,7 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* 
%out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -102,7 +102,7 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -121,7 +121,7 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -139,7 +139,7 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -161,7 +161,7 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]| ; SI: buffer_store_dword [[RESULT]] -define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/concat_vectors.ll b/test/CodeGen/AMDGPU/concat_vectors.ll index 2e6be5d10f09..7394842d156f 100644 --- a/test/CodeGen/AMDGPU/concat_vectors.ll +++ b/test/CodeGen/AMDGPU/concat_vectors.ll @@ -8,7 +8,7 @@ ; value if we want to ensure scratch memory is not being used. 
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -17,7 +17,7 @@ define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x ; FUNC-LABEL: {{^}}test_concat_v2i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -26,7 +26,7 @@ define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x ; FUNC-LABEL: {{^}}test_concat_v4i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 ret void @@ -35,7 +35,7 @@ define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x ; FUNC-LABEL: {{^}}test_concat_v8i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 ret void @@ -44,7 +44,7 @@ define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x ; FUNC-LABEL: {{^}}test_concat_v16i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 ret void @@ -53,7 +53,7 @@ define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <1 ; FUNC-LABEL: {{^}}test_concat_v1f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 ret void @@ -62,7 +62,7 @@ define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, < ; FUNC-LABEL: {{^}}test_concat_v2f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, 
<2 x float> %a, <2 x float> %b) nounwind { %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 ret void @@ -71,7 +71,7 @@ define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, < ; FUNC-LABEL: {{^}}test_concat_v4f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 ret void @@ -80,7 +80,7 @@ define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, < ; FUNC-LABEL: {{^}}test_concat_v8f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 ret void @@ -89,7 +89,7 @@ define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, ; FUNC-LABEL: {{^}}test_concat_v16f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 ret void @@ -98,7 +98,7 @@ define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a ; FUNC-LABEL: {{^}}test_concat_v1i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 ret void @@ -107,7 +107,7 @@ define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v2i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 ret void @@ -116,7 +116,7 @@ define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v4i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x 
i32> store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 ret void @@ -125,7 +125,7 @@ define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v8i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 ret void @@ -134,7 +134,7 @@ define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a ; FUNC-LABEL: {{^}}test_concat_v16i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 ret void @@ -143,7 +143,7 @@ define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> ; FUNC-LABEL: {{^}}test_concat_v1f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 ret void @@ -152,7 +152,7 @@ define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v2f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 ret void @@ -161,7 +161,7 @@ define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v4f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 ret void @@ -170,7 +170,7 @@ define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v8f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> store <16 x double> %concat, <16 x double> addrspace(1)* 
%out, align 128 ret void @@ -179,7 +179,7 @@ define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a ; FUNC-LABEL: {{^}}test_concat_v16f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 ret void @@ -188,7 +188,7 @@ define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> ; FUNC-LABEL: {{^}}test_concat_v1i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> store <2 x i1> %concat, <2 x i1> addrspace(1)* %out ret void @@ -197,7 +197,7 @@ define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> ; FUNC-LABEL: {{^}}test_concat_v2i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> store <4 x i1> %concat, <4 x i1> addrspace(1)* %out ret void @@ -206,7 +206,7 @@ define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> ; FUNC-LABEL: {{^}}test_concat_v4i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> store <8 x i1> %concat, <8 x i1> addrspace(1)* %out ret void @@ -215,7 +215,7 @@ define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> ; FUNC-LABEL: {{^}}test_concat_v8i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> store <16 x i1> %concat, <16 x i1> addrspace(1)* %out ret void @@ -224,7 +224,7 @@ define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1 ; FUNC-LABEL: {{^}}test_concat_v16i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> store <32 x i1> %concat, <32 x i1> addrspace(1)* %out ret void @@ -233,7 +233,7 @@ define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x ; FUNC-LABEL: {{^}}test_concat_v32i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) 
nounwind { +define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> store <64 x i1> %concat, <64 x i1> addrspace(1)* %out ret void @@ -242,7 +242,7 @@ define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x ; FUNC-LABEL: {{^}}test_concat_v1i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 ret void @@ -251,7 +251,7 @@ define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x ; FUNC-LABEL: {{^}}test_concat_v2i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 ret void @@ -260,7 +260,7 @@ define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x ; FUNC-LABEL: {{^}}test_concat_v4i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 ret void @@ -269,7 +269,7 @@ define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x ; FUNC-LABEL: {{^}}test_concat_v8i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 ret void @@ -278,7 +278,7 @@ define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x ; FUNC-LABEL: {{^}}test_concat_v16i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 ret void @@ -286,7 +286,7 @@ define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <1 ; FUNC-LABEL: {{^}}concat_vector_crash: ; SI: s_endpgm -define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { bb: %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> diff --git 
a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 34bb2588ad62..62b47beb1251 100644 --- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -1,12 +1,12 @@ # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s --- | - define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %and = and i32 %a, 1234567 store volatile i32 %and, i32 addrspace(1)* %out ret void } - define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -17,13 +17,13 @@ ret void } - define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl = shl i32 %a, 12 store volatile i32 %shl, i32 addrspace(1)* %out ret void } - define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -34,13 +34,13 @@ ret void } - define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %ashr = ashr i32 %a, 12 store volatile i32 %ashr, i32 addrspace(1)* %out ret void } - define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -51,13 +51,13 @@ ret void } - define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %lshr = lshr i32 %a, 12 store volatile i32 %lshr, i32 addrspace(1)* %out ret void } - define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll index 0ff75ab58003..0831d250b9e7 100644 --- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -5,7 +5,7 @@ ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_and_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 
%size = call i32 @llvm.amdgcn.groupstaticsize() %and = and i32 %size, %x @@ -17,7 +17,7 @@ define void @fold_mi_v_and_0(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %and = and i32 %size, %x store i32 %and, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_or_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %size = call i32 @llvm.amdgcn.groupstaticsize() %or = or i32 %size, %x @@ -42,7 +42,7 @@ define void @fold_mi_v_or_0(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %or = or i32 %size, %x store i32 %or, i32 addrspace(1)* %out @@ -53,7 +53,7 @@ define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, %x @@ -67,7 +67,7 @@ define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, %x store i32 %xor, i32 addrspace(1)* %out @@ -78,7 +78,7 @@ define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, -1 store i32 %xor, i32 addrspace(1)* %out @@ -91,7 +91,7 @@ define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]] ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @fold_mi_v_not_0(i64 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { %vreg = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg) %xor = xor i64 %ctpop, -1 @@ -110,7 +110,7 @@ define void @fold_mi_v_not_0(i64 addrspace(1)* %out) { ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @fold_mi_or_neg1(i64 addrspace(1)* %out) { 
+define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) { %vreg0 = load volatile i64, i64 addrspace(1)* undef %vreg1 = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0) @@ -126,7 +126,7 @@ define void @fold_mi_or_neg1(i64 addrspace(1)* %out) { ; GCN: v_not_b32 ; GCN: v_and_b32 ; GCN-NOT: v_and_b32 -define void @fold_mi_and_neg1(i64 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) { %vreg0 = load volatile i64, i64 addrspace(1)* undef %vreg1 = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0) diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 13383cbc1741..d3e6c11ef908 100644 --- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -10,6 +10,8 @@ ; GCN-LABEL: {{^}}divergent_if_endif: +; VGPR: workitem_private_segment_byte_size = 12{{$}} + ; GCN: {{^}}; BB#0: ; GCN: s_mov_b32 m0, -1 @@ -26,12 +28,13 @@ ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:8 ; 4-byte Folded Spill ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill + ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} ; GCN: s_waitcnt vmcnt(0) expcnt(0) @@ -55,11 +58,11 @@ -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:8 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] @@ -69,7 +72,7 @@ ; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] -define void @divergent_if_endif(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef @@ -88,6 +91,8 @@ endif: } ; GCN-LABEL: {{^}}divergent_loop: +; VGPR: workitem_private_segment_byte_size = 16{{$}} + ; GCN: {{^}}; BB#0: ; GCN: s_mov_b32 m0, -1 @@ -100,7 +105,7 @@ endif: ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -108,9 +113,9 @@ 
endif: ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -120,7 +125,7 @@ endif: ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: -; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] ; GCN: v_cmp_ne_u32_e32 vcc, ; GCN: s_and_b64 vcc, exec, vcc @@ -133,11 +138,11 @@ endif: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] @@ -145,7 +150,7 @@ endif: ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] -define void @divergent_loop(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef @@ -180,7 +185,7 @@ end: ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -237,14 +242,14 @@ end: ; GCN: BB{{[0-9]+}}_2: ; %if ; GCN: ds_read_b32 -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill ; GCN: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: [[ELSE]]: ; %else -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], 
vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_waitcnt vmcnt(0) expcnt(0) @@ -267,7 +272,7 @@ end: ; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] -define void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef diff --git a/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/test/CodeGen/AMDGPU/convergent-inlineasm.ll index 755f439c6863..0074a41e44cf 100644 --- a/test/CodeGen/AMDGPU/convergent-inlineasm.ll +++ b/test/CodeGen/AMDGPU/convergent-inlineasm.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN: v_cmp_ne_u32_e64 ; GCN: ; mask branch ; GCN: BB{{[0-9]+_[0-9]+}}: -define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) { +define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1 @@ -29,7 +29,8 @@ bb5: ; preds = %bb3, %bb ; GCN: v_cmp_ne_u32_e64 ; GCN: BB{{[0-9]+_[0-9]+}}: -define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { + +define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll index 7434d745b259..026dd7ca6c87 100644 --- a/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; GCN: buffer_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -32,7 +32,7 @@ define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %val = load 
<4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -47,7 +47,7 @@ define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -65,7 +65,7 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( ; GCN-DAG: buffer_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %add = add <4 x i8> %val, store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -85,7 +85,7 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add ; GCN: {{buffer|flat}}_store_dword ; GCN: {{buffer|flat}}_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 @@ -101,7 +101,7 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm -define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -113,7 +113,7 @@ define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm -define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 ret void @@ -128,7 +128,7 @@ define void @test_copy_v3i8_align2(<3 x i8> 
addrspace(1)* %out, <3 x i8> addrspa ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 ret void @@ -141,7 +141,7 @@ define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void @@ -157,7 +157,7 @@ define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/copy-to-reg.ll b/test/CodeGen/AMDGPU/copy-to-reg.ll index 3422a889a520..f35b0706f3d3 100644 --- a/test/CodeGen/AMDGPU/copy-to-reg.ll +++ b/test/CodeGen/AMDGPU/copy-to-reg.ll @@ -6,7 +6,7 @@ ; Make sure this doesn't crash ; CHECK-LABEL: {{^}}copy_to_reg_frameindex: -define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %alloca = alloca [16 x i32] br label %loop diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll index 1a0027dd4a3c..e252971e3f42 100644 --- a/test/CodeGen/AMDGPU/ctlz.ll +++ b/test/CodeGen/AMDGPU/ctlz.ll @@ -27,7 +27,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; EG: FFBH_UINT ; EG: CNDE_INT -define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void @@ -43,7 +43,7 @@ define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { ; EG: FFBH_UINT ; EG: CNDE_INT -define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 @@ -61,7 +61,7 @@ define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia ; EG: CNDE_INT ; EG: FFBH_UINT ; EG: CNDE_INT -define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { 
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 @@ -89,7 +89,7 @@ define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp ; EG-DAG: FFBH_UINT ; EG-DAG: CNDE_INT -define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 @@ -98,10 +98,11 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp ; FUNC-LABEL: {{^}}v_ctlz_i8: ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], -; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GCN: buffer_store_byte [[RESULT]], ; GCN: s_endpgm -define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out @@ -119,14 +120,14 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias % ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}s_ctlz_i64_trunc: -define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) %trunc = trunc i64 %ctlz to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -145,7 +146,7 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]] ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} -define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -156,7 +157,7 @@ define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia } ; FUNC-LABEL: {{^}}v_ctlz_i64_trunc: -define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* 
noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -172,7 +173,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm - define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -186,7 +187,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -202,7 +203,7 @@ define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspac ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm -define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %ctlz, 32 @@ -217,7 +218,7 @@ define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addr ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm -define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %ctlz, 32 @@ -230,7 +231,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: {{buffer|flat}}_store_byte [[FFBH]], - define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid %val = load i8, i8 addrspace(1)* %valptr.gep @@ -245,7 +246,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr ; SI: buffer_load_ushort [[VAL:v[0-9]+]], ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; SI: buffer_store_short [[FFBH]], - define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) 
nounwind { %val = load i16, i16 addrspace(1)* %valptr %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone %cmp = icmp eq i16 %val, 0 @@ -260,7 +261,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]] ; GCN: {{buffer|flat}}_store_byte [[TRUNC]], -define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid %val = load i7, i7 addrspace(1)* %valptr.gep diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index d390f64deeab..87ba563a740f 100644 --- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -22,7 +22,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; GCN: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou ; GCN: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 @@ -51,7 +51,7 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 @@ -71,7 +71,7 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] -define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 @@ -82,7 +82,7 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], -define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out @@ -100,14 +100,14 @@ define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1) ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) store i64 %ctlz, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc: -define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -123,7 +123,7 @@ define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %va ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -134,7 +134,7 @@ define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc: -define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: 
buffer_store_dword [[RESULT]], - define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], -define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -175,7 +175,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: {{buffer|flat}}_store_byte [[FFBH]], -define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid %val = load i8, i8 addrspace(1)* %valptr.gep @@ -194,7 +194,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 ; GCN-DAG: buffer_store_dword [[RESULT0]] ; GCN-DAG: buffer_store_byte [[RESULT1]] ; GCN: s_endpgm - define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -211,7 +211,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -227,7 +227,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword -define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -243,7 +243,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* 
noalias %out, i32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 1 @@ -259,7 +259,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword -define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 1 diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll index 9692236bb363..a29e72ea57cb 100644 --- a/test/CodeGen/AMDGPU/ctpop.ll +++ b/test/CodeGen/AMDGPU/ctpop.ll @@ -16,7 +16,7 @@ declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone ; GCN: s_endpgm ; EG: BCNT_INT -define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone store i32 %ctpop, i32 addrspace(1)* %out, align 4 ret void @@ -30,7 +30,7 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone store i32 %ctpop, i32 addrspace(1)* %out, align 4 @@ -48,7 +48,7 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { +define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { %val0 = load i32, i32 addrspace(1)* %in0, align 4 %val1 = load i32, i32 addrspace(1)* %in1, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone @@ -64,7 +64,7 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { +define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { %val0 = load i32, i32 addrspace(1)* %in0, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %add = add i32 %ctpop0, %sval @@ -79,7 +79,7 @@ define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace( ; EG: BCNT_INT ; EG: 
BCNT_INT -define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 @@ -97,7 +97,7 @@ define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrs ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 @@ -123,7 +123,7 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 @@ -165,7 +165,7 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 @@ -179,7 +179,7 @@ define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> ad ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 4 @@ -194,7 +194,7 @@ define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 4, %ctpop @@ -209,7 +209,7 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias 
%out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 99999 @@ -225,7 +225,7 @@ define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspa ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, %const @@ -241,7 +241,7 @@ define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1 ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %const, %ctpop @@ -258,7 +258,7 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4 @@ -279,7 +279,7 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT -define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { +define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { entry: %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %else diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll index cd5d805e5db3..2610684ad9ee 100644 --- a/test/CodeGen/AMDGPU/ctpop64.ll +++ b/test/CodeGen/AMDGPU/ctpop64.ll @@ -17,7 +17,7 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], ; GCN: s_endpgm -define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -31,7 +31,7 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { ; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, 
i64 addrspace(1)* noalias %in) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 @@ -48,7 +48,7 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GCN: s_endpgm -define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { +define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %or = or i64 %ctpop, %s.val @@ -60,7 +60,7 @@ define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* ; GCN: s_bcnt1_i32_b64 ; GCN: s_bcnt1_i32_b64 ; GCN: s_endpgm -define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { +define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 @@ -73,7 +73,7 @@ define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) ; GCN: s_bcnt1_i32_b64 ; GCN: s_bcnt1_i32_b64 ; GCN: s_endpgm -define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { +define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 @@ -86,7 +86,7 @@ define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) ; GCN: v_bcnt_u32_b32 ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm -define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> @@ -104,7 +104,7 @@ define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrs ; GCN: v_bcnt_u32_b32 ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm -define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -121,7 +121,7 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]] ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} ; GCN: s_endpgm -define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { +define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) 
{ entry: %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %else @@ -146,7 +146,7 @@ endif: ; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]], ; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]] ; GCN: s_endpgm -define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -159,7 +159,7 @@ define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]], ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]] ; GCN: s_endpgm -define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone %truncctpop = trunc i65 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -181,7 +181,7 @@ define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { %val = load i128, i128 addrspace(1)* %in, align 8 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll index e33cc18eb05f..1fa6407647eb 100644 --- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -14,7 +14,7 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone ; SI: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou ; SI: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 @@ -43,7 +43,7 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? 
*}}[[RESULT]] -define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 @@ -63,7 +63,7 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 diff --git a/test/CodeGen/AMDGPU/cube.ll b/test/CodeGen/AMDGPU/cube.ll index 9b512c439b0e..7b5f1aff7ea6 100644 --- a/test/CodeGen/AMDGPU/cube.ll +++ b/test/CodeGen/AMDGPU/cube.ll @@ -6,16 +6,13 @@ declare float @llvm.amdgcn.cubesc(float, float, float) #0 declare float @llvm.amdgcn.cubetc(float, float, float) #0 declare float @llvm.amdgcn.cubema(float, float, float) #0 -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0 - - ; GCN-LABEL: {{^}}cube: ; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: _store_dwordx4 -define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 { %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c) %cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c) @@ -29,18 +26,5 @@ define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) ret void } -; GCN-LABEL: {{^}}legacy_cube: -; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -; GCN: _store_dwordx4 -define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 { - %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx) - store <4 x float> %cube, <4 x float> addrspace(1)* %out - ret void -} - attributes #0 = { nounwind readnone } attributes #1 = { nounwind } - diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 7baaa81fba59..e16daa6fad9d 100644 --- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; GCN-NOT: lshr ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dword [[CONV]], -define void @load_i8_to_f32(float 
addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %cvt = uitofp i8 %load to float store float %cvt, float addrspace(1)* %out, align 4 @@ -22,7 +22,7 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 %cvt = uitofp <2 x i8> %load to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 @@ -36,7 +36,7 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 @@ -52,7 +52,7 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 @@ -76,7 +76,7 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]] ; GCN: buffer_store_dwordx4 -define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 @@ -110,7 +110,7 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out ; GCN: {{buffer|flat}}_store_dword ; GCN: s_endpgm -define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* 
noalias %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 @@ -124,7 +124,7 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, < ; Make sure this doesn't crash. ; GCN-LABEL: {{^}}load_v7i8_to_v7f32: ; GCN: s_endpgm -define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 %cvt = uitofp <7 x i8> %load to <7 x float> store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 @@ -147,7 +147,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> ; GCN-NOT: lshr ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 @@ -159,7 +159,7 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] ; GCN: buffer_store_dword [[CONV]], -define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 2 %inreg = and i32 %add, 255 @@ -169,7 +169,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr } ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: -define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %inreg = and i32 %load, 65280 %shr = lshr i32 %inreg, 8 @@ -181,7 +181,7 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr ; We don't get these ones because of the zext, but instcombine removes ; them so it shouldn't really matter. 
; GCN-LABEL: {{^}}i8_zext_i32_to_f32: -define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %ext = zext i8 %load to i32 %cvt = uitofp i32 %ext to float @@ -190,7 +190,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1 } ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: -define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %ext = zext <4 x i8> %load to <4 x i32> %cvt = uitofp <4 x i32> %ext to <4 x float> @@ -203,7 +203,7 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %and = and i32 %val, 255 %cvt = uitofp i32 %and to float @@ -216,7 +216,7 @@ define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspac ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 8 %and = and i32 %srl, 255 @@ -230,7 +230,7 @@ define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspac ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 16 %and = and i32 %srl, 255 @@ -244,7 +244,7 @@ define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspac ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 24 %and = and i32 %srl, 255 diff --git a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll index e7773c6e2a4f..c10cf1a8a6f2 100644 --- a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll +++ b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll @@ -10,7 +10,7 @@ declare float @llvm.floor.f32(float) #1 ; SI-NOT: add ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void 
@cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %floor = call float @llvm.floor.f32(float %x) #1 %cvt = fptosi float %floor to i32 store i32 %cvt, i32 addrspace(1)* %out @@ -22,7 +22,7 @@ define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 1.0 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptosi float %floor to i32 @@ -35,7 +35,7 @@ define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| ; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %floor = call float @llvm.floor.f32(float %x.fabs) #1 %cvt = fptosi float %floor to i32 @@ -48,7 +48,7 @@ define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fneg = fsub float -0.000000e+00, %x %floor = call float @llvm.floor.f32(float %x.fneg) #1 %cvt = fptosi float %floor to i32 @@ -61,7 +61,7 @@ define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| ; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 @@ -75,7 +75,7 @@ define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { ; SI: v_floor_f32 ; SI: v_cvt_u32_f32_e32 ; SI: s_endpgm -define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %floor = call float @llvm.floor.f32(float %x) #1 %cvt = fptoui float %floor to i32 store i32 %cvt, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll index d38411dcca61..9b771ebdf7b3 100644 --- a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll +++ b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll @@ -9,7 +9,7 @@ declare float @llvm.floor.f32(float) #1 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32 ; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptosi float %floor to i32 @@ -21,7 +21,7 @@ define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_rpi_i32_f32 ; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void 
@cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %fadd = fadd float %x.fabs, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 @@ -37,7 +37,7 @@ define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fneg = fsub float -0.000000e+00, %x %fadd = fadd float %x.fneg, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 @@ -55,7 +55,7 @@ define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs %fadd = fadd float %x.fabs.fneg, 0.5 @@ -71,7 +71,7 @@ define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { ; SI: v_floor_f32 ; SI: v_cvt_u32_f32 ; SI: s_endpgm -define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptoui float %floor to i32 diff --git a/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll index a32c16dfac38..11acbc274eb5 100644 --- a/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll +++ b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll @@ -9,7 +9,7 @@ ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]] ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]] -define void @store_same_base_ptr(i32 addrspace(1)* %out) { +define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #0 %offset = sext i32 %id to i64 diff --git a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll index fb43ff4fbddd..ceff889b3a7e 100644 --- a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll +++ b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll @@ -10,7 +10,7 @@ ; CHECK: {{^}}sint: ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %sint = load i32, i32 addrspace(1) * %in @@ -24,7 +24,7 @@ entry: ;CHECK: {{^}}uint: ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %uint = load i32, i32 addrspace(1) * %in diff --git a/test/CodeGen/AMDGPU/debug.ll b/test/CodeGen/AMDGPU/debug.ll index a2e0e878b740..f149aaddb8ef 100644 --- a/test/CodeGen/AMDGPU/debug.ll +++ b/test/CodeGen/AMDGPU/debug.ll @@ -4,7 +4,7 @@ ; Test for a crash in the custom assembly dump code. 
; SI: s_endpgm -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { store i32 0, i32 addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll index 49a7e722f29c..734905ba2b08 100644 --- a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll +++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll @@ -23,7 +23,7 @@ ; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 diff --git a/test/CodeGen/AMDGPU/debugger-insert-nops.ll b/test/CodeGen/AMDGPU/debugger-insert-nops.ll index 6638f4e25821..fcdbfb10a8ca 100644 --- a/test/CodeGen/AMDGPU/debugger-insert-nops.ll +++ b/test/CodeGen/AMDGPU/debugger-insert-nops.ll @@ -1,27 +1,35 @@ -; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECKNOP -; CHECK: test01.cl:2:{{[0-9]+}} -; CHECK-NEXT: s_nop 0 +; This test expects that we have one instance for each line in some order with "s_nop 0" instances after each. -; CHECK: test01.cl:3:{{[0-9]+}} -; CHECK-NEXT: s_nop 0 +; Check that each line appears at least once +; CHECK-DAG: test01.cl:2:3 +; CHECK-DAG: test01.cl:3:3 +; CHECK-DAG: test01.cl:4:3 -; CHECK: test01.cl:4:{{[0-9]+}} -; CHECK-NEXT: s_nop 0 + +; Check that each of each of the lines consists of the line output, followed by "s_nop 0" +; CHECKNOP: test01.cl:{{[234]}}:3 +; CHECKNOP-NEXT: s_nop 0 +; CHECKNOP: test01.cl:{{[234]}}:3 +; CHECKNOP-NEXT: s_nop 0 +; CHECKNOP: test01.cl:{{[234]}}:3 +; CHECKNOP-NEXT: s_nop 0 ; CHECK: test01.cl:5:{{[0-9]+}} ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_endpgm ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19 %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20 - store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21 + store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !20 %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22 %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22 store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23 diff --git a/test/CodeGen/AMDGPU/debugger-reserve-regs.ll b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll index d30bb20bb03a..764c60b12bf9 100644 --- a/test/CodeGen/AMDGPU/debugger-reserve-regs.ll +++ b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll @@ -1,11 +1,12 @@ ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=+amdgpu-debugger-reserve-regs 
-verify-machineinstrs < %s | FileCheck %s ; CHECK: reserved_vgpr_first = {{[0-9]+}} ; CHECK-NEXT: reserved_vgpr_count = 4 ; CHECK: ReservedVGPRFirst: {{[0-9]+}} ; CHECK-NEXT: ReservedVGPRCount: 4 ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll index 28d065e3b32b..ad9111a28654 100644 --- a/test/CodeGen/AMDGPU/default-fp-mode.ll +++ b/test/CodeGen/AMDGPU/default-fp-mode.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}test_default_si: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { +define amdgpu_kernel void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -12,7 +12,7 @@ define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %ou ; GCN-LABEL: {{^}}test_default_vi: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { +define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -21,7 +21,7 @@ define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %ou ; GCN-LABEL: {{^}}test_f64_denormals: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { +define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -30,7 +30,7 @@ define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* ; GCN-LABEL: {{^}}test_f32_denormals: ; GCNL: FloatMode: 48 ; GCN: IeeeMode: 1 -define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { +define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -39,7 +39,7 @@ define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* ; GCN-LABEL: {{^}}test_f32_f64_denormals: ; GCN: FloatMode: 240 ; GCN: IeeeMode: 1 -define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { +define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -48,12 +48,40 @@ define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace( ; GCN-LABEL: {{^}}test_no_denormals ; GCN: FloatMode: 0 ; GCN: IeeeMode: 1 -define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { +define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void } +; GCN-LABEL: {{^}}test_f16_f64_denormals: +; GCN: FloatMode: 192 +; GCN: IeeeMode: 1 +define amdgpu_kernel void 
@test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 { + store half 0.0, half addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} + +; GCN-LABEL: {{^}}test_no_f16_f64_denormals: +; GCN: FloatMode: 0 +; GCN: IeeeMode: 1 +define amdgpu_kernel void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 { + store half 0.0, half addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} + +; GCN-LABEL: {{^}}test_f32_f16_f64_denormals: +; GCN: FloatMode: 240 +; GCN: IeeeMode: 1 +define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 { + store half 0.0, half addrspace(1)* %out0 + store float 0.0, float addrspace(1)* %out1 + store double 0.0, double addrspace(1)* %out2 + ret void +} + ; GCN-LABEL: {{^}}kill_gs_const: ; GCN: IeeeMode: 0 define amdgpu_gs void @kill_gs_const() { @@ -69,22 +97,22 @@ main_body: ; GCN-LABEL: {{^}}kill_vcc_implicit_def: ; GCN: IeeeMode: 0 -define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { +define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { entry: %tmp0 = fcmp olt float %13, 0.0 call void @llvm.AMDGPU.kill(float %14) %tmp1 = select i1 %tmp0, float 1.0, float 0.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) - ret void + ret float %tmp1 } - declare void @llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { nounwind "target-cpu"="tahiti" } attributes #1 = { nounwind "target-cpu"="fiji" } attributes #2 = { nounwind "target-features"="+fp64-denormals" } attributes #3 = { nounwind "target-features"="+fp32-denormals" } attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" } -attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" } +attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } +attributes #6 = { nounwind "target-features"="+fp64-fp16-denormals" } +attributes #7 = { nounwind "target-features"="-fp64-fp16-denormals" } +attributes #8 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" } diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir index 9d70f67ef491..32e6f7cc0cdc 100644 --- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir +++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir @@ -1,14 +1,14 @@ # RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s --- | - define void @test0() { ret void } - define void @test1() { ret void } - define void @test2() { ret void } - define void @test3() { ret void } - define void @test4() { ret void } - define void @test5() { ret void } - define void @loop0() { ret void } - define void @loop1() { ret void } - define void @loop2() { ret void } + define 
amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } + define amdgpu_kernel void @test2() { ret void } + define amdgpu_kernel void @test3() { ret void } + define amdgpu_kernel void @test4() { ret void } + define amdgpu_kernel void @test5() { ret void } + define amdgpu_kernel void @loop0() { ret void } + define amdgpu_kernel void @loop1() { ret void } + define amdgpu_kernel void @loop2() { ret void } ... --- # Combined use/def transfer check, the basics. diff --git a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll index cdd2c0cd4f43..6dfe1294bb47 100644 --- a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll +++ b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll @@ -9,7 +9,7 @@ ; CHECK: ALU_PUSH_BEFORE ; CHECK-NEXT: JUMP ; CHECK-NEXT: LOOP_BREAK -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { +define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { entry: %cmp5 = icmp sgt i32 %iterations, 0 br i1 %cmp5, label %for.body, label %for.end diff --git a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll index 5e1ebfde3e10..878b5ebe9409 100644 --- a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll +++ b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll @@ -9,7 +9,7 @@ ; GCN: buffer_load_dword ; GCN: ds_write2_b32 ; GCN: s_endpgm -define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { +define amdgpu_kernel void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { entry: %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx = shl i32 %tid, 2 diff --git a/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/test/CodeGen/AMDGPU/ds-combine-large-stride.ll new file mode 100644 index 000000000000..a723b0210ade --- /dev/null +++ b/test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -0,0 +1,412 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s + +; GCN-LABEL: ds_read32_combine_stride_400: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100 +define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { +bb: + %tmp = load float, float addrspace(3)* %arg, align 4 + %tmp2 = fadd float %tmp, 0.000000e+00 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100 + %tmp4 = load float, float addrspace(3)* %tmp3, align 4 + %tmp5 = fadd float 
%tmp2, %tmp4 + %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200 + %tmp7 = load float, float addrspace(3)* %tmp6, align 4 + %tmp8 = fadd float %tmp5, %tmp7 + %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300 + %tmp10 = load float, float addrspace(3)* %tmp9, align 4 + %tmp11 = fadd float %tmp8, %tmp10 + %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400 + %tmp13 = load float, float addrspace(3)* %tmp12, align 4 + %tmp14 = fadd float %tmp11, %tmp13 + %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500 + %tmp16 = load float, float addrspace(3)* %tmp15, align 4 + %tmp17 = fadd float %tmp14, %tmp16 + %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600 + %tmp19 = load float, float addrspace(3)* %tmp18, align 4 + %tmp20 = fadd float %tmp17, %tmp19 + %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700 + %tmp22 = load float, float addrspace(3)* %tmp21, align 4 + %tmp23 = fadd float %tmp20, %tmp22 + store float %tmp23, float *%arg1, align 4 + ret void +} + +; GCN-LABEL: ds_read32_combine_stride_400_back: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100 +define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { +bb: + %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700 + %tmp2 = load float, float addrspace(3)* %tmp, align 4 + %tmp3 = fadd float %tmp2, 0.000000e+00 + %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600 + %tmp5 = load float, float addrspace(3)* %tmp4, align 4 + %tmp6 = fadd float %tmp3, %tmp5 + %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500 + %tmp8 = load float, float addrspace(3)* %tmp7, align 4 + %tmp9 = fadd float %tmp6, %tmp8 + %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400 + %tmp11 = load float, float addrspace(3)* %tmp10, align 4 + %tmp12 = fadd float %tmp9, %tmp11 + %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300 + %tmp14 = load float, float addrspace(3)* %tmp13, align 4 + %tmp15 = fadd float %tmp12, %tmp14 + %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200 + %tmp17 = load float, float addrspace(3)* %tmp16, align 4 + %tmp18 = fadd float %tmp15, %tmp17 + %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100 + %tmp20 = load float, float addrspace(3)* %tmp19, align 4 + %tmp21 = fadd float %tmp18, %tmp20 + %tmp22 = load float, float addrspace(3)* %arg, align 4 + %tmp23 = fadd float %tmp21, %tmp22 + store float %tmp23, float *%arg1, align 4 + ret void +} + +; GCN-LABEL: ds_read32_combine_stride_8192: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], 
[[BASE]] offset1:32 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224 +define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { +bb: + %tmp = load float, float addrspace(3)* %arg, align 4 + %tmp2 = fadd float %tmp, 0.000000e+00 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048 + %tmp4 = load float, float addrspace(3)* %tmp3, align 4 + %tmp5 = fadd float %tmp2, %tmp4 + %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096 + %tmp7 = load float, float addrspace(3)* %tmp6, align 4 + %tmp8 = fadd float %tmp5, %tmp7 + %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144 + %tmp10 = load float, float addrspace(3)* %tmp9, align 4 + %tmp11 = fadd float %tmp8, %tmp10 + %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192 + %tmp13 = load float, float addrspace(3)* %tmp12, align 4 + %tmp14 = fadd float %tmp11, %tmp13 + %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240 + %tmp16 = load float, float addrspace(3)* %tmp15, align 4 + %tmp17 = fadd float %tmp14, %tmp16 + %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288 + %tmp19 = load float, float addrspace(3)* %tmp18, align 4 + %tmp20 = fadd float %tmp17, %tmp19 + %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336 + %tmp22 = load float, float addrspace(3)* %tmp21, align 4 + %tmp23 = fadd float %tmp20, %tmp22 + store float %tmp23, float *%arg1, align 4 + ret void +} + +; GCN-LABEL: ds_read32_combine_stride_8192_shifted: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32 +define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { +bb: + %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2 + %tmp2 = load float, float addrspace(3)* %tmp, align 4 + %tmp3 = fadd float %tmp2, 0.000000e+00 + %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2050 + %tmp5 = load float, float addrspace(3)* %tmp4, align 4 + %tmp6 = fadd float %tmp3, %tmp5 + %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4098 + %tmp8 = load float, float addrspace(3)* %tmp7, align 4 + %tmp9 = fadd float %tmp6, %tmp8 + %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6146 + %tmp11 = load float, float addrspace(3)* %tmp10, align 4 + %tmp12 = fadd float %tmp9, %tmp11 + %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8194 + %tmp14 = load float, float addrspace(3)* %tmp13, align 4 + %tmp15 = fadd float %tmp12, %tmp14 + %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10242 + 
%tmp17 = load float, float addrspace(3)* %tmp16, align 4 + %tmp18 = fadd float %tmp15, %tmp17 + store float %tmp18, float *%arg1, align 4 + ret void +} + +; GCN-LABEL: ds_read64_combine_stride_400: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50 +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150 +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250 +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50 +define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) { +bb: + %tmp = load double, double addrspace(3)* %arg, align 8 + %tmp2 = fadd double %tmp, 0.000000e+00 + %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50 + %tmp4 = load double, double addrspace(3)* %tmp3, align 8 + %tmp5 = fadd double %tmp2, %tmp4 + %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100 + %tmp7 = load double, double addrspace(3)* %tmp6, align 8 + %tmp8 = fadd double %tmp5, %tmp7 + %tmp9 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150 + %tmp10 = load double, double addrspace(3)* %tmp9, align 8 + %tmp11 = fadd double %tmp8, %tmp10 + %tmp12 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200 + %tmp13 = load double, double addrspace(3)* %tmp12, align 8 + %tmp14 = fadd double %tmp11, %tmp13 + %tmp15 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250 + %tmp16 = load double, double addrspace(3)* %tmp15, align 8 + %tmp17 = fadd double %tmp14, %tmp16 + %tmp18 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300 + %tmp19 = load double, double addrspace(3)* %tmp18, align 8 + %tmp20 = fadd double %tmp17, %tmp19 + %tmp21 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350 + %tmp22 = load double, double addrspace(3)* %tmp21, align 8 + %tmp23 = fadd double %tmp20, %tmp22 + store double %tmp23, double *%arg1, align 8 + ret void +} + +; GCN-LABEL: ds_read64_combine_stride_8192_shifted: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16 +; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16 +; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16 +define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) { +bb: + %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1 + %tmp2 = load double, double addrspace(3)* %tmp, align 8 + %tmp3 = fadd double %tmp2, 0.000000e+00 + %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025 + %tmp5 = load double, double addrspace(3)* %tmp4, align 8 + %tmp6 = fadd double %tmp3, %tmp5 + %tmp7 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049 + %tmp8 = load double, 
double addrspace(3)* %tmp7, align 8 + %tmp9 = fadd double %tmp6, %tmp8 + %tmp10 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073 + %tmp11 = load double, double addrspace(3)* %tmp10, align 8 + %tmp12 = fadd double %tmp9, %tmp11 + %tmp13 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097 + %tmp14 = load double, double addrspace(3)* %tmp13, align 8 + %tmp15 = fadd double %tmp12, %tmp14 + %tmp16 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121 + %tmp17 = load double, double addrspace(3)* %tmp16, align 8 + %tmp18 = fadd double %tmp15, %tmp17 + store double %tmp18, double *%arg1, align 8 + ret void +} + +; GCN-LABEL: ds_write32_combine_stride_400: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) { +bb: + store float 1.000000e+00, float addrspace(3)* %arg, align 4 + %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 100 + store float 1.000000e+00, float addrspace(3)* %tmp, align 4 + %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200 + store float 1.000000e+00, float addrspace(3)* %tmp1, align 4 + %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300 + store float 1.000000e+00, float addrspace(3)* %tmp2, align 4 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400 + store float 1.000000e+00, float addrspace(3)* %tmp3, align 4 + %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500 + store float 1.000000e+00, float addrspace(3)* %tmp4, align 4 + %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600 + store float 1.000000e+00, float addrspace(3)* %tmp5, align 4 + %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700 + store float 1.000000e+00, float addrspace(3)* %tmp6, align 4 + ret void +} + +; GCN-LABEL: ds_write32_combine_stride_400_back: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) { +bb: + %tmp = getelementptr 
inbounds float, float addrspace(3)* %arg, i32 700 + store float 1.000000e+00, float addrspace(3)* %tmp, align 4 + %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600 + store float 1.000000e+00, float addrspace(3)* %tmp1, align 4 + %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500 + store float 1.000000e+00, float addrspace(3)* %tmp2, align 4 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400 + store float 1.000000e+00, float addrspace(3)* %tmp3, align 4 + %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300 + store float 1.000000e+00, float addrspace(3)* %tmp4, align 4 + %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200 + store float 1.000000e+00, float addrspace(3)* %tmp5, align 4 + %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100 + store float 1.000000e+00, float addrspace(3)* %tmp6, align 4 + store float 1.000000e+00, float addrspace(3)* %arg, align 4 + ret void +} + +; GCN-LABEL: ds_write32_combine_stride_8192: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224 +define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) { +bb: + store float 1.000000e+00, float addrspace(3)* %arg, align 4 + %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048 + store float 1.000000e+00, float addrspace(3)* %tmp, align 4 + %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096 + store float 1.000000e+00, float addrspace(3)* %tmp1, align 4 + %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144 + store float 1.000000e+00, float addrspace(3)* %tmp2, align 4 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192 + store float 1.000000e+00, float addrspace(3)* %tmp3, align 4 + %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240 + store float 1.000000e+00, float addrspace(3)* %tmp4, align 4 + %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288 + store float 1.000000e+00, float addrspace(3)* %tmp5, align 4 + %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336 + store float 1.000000e+00, float addrspace(3)* %tmp6, align 4 + ret void +} + +; GCN-LABEL: ds_write32_combine_stride_8192_shifted: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]] +; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) { +bb: + %tmp = getelementptr inbounds float, float 
addrspace(3)* %arg, i32 1 + store float 1.000000e+00, float addrspace(3)* %tmp, align 4 + %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2049 + store float 1.000000e+00, float addrspace(3)* %tmp1, align 4 + %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4097 + store float 1.000000e+00, float addrspace(3)* %tmp2, align 4 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6145 + store float 1.000000e+00, float addrspace(3)* %tmp3, align 4 + %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8193 + store float 1.000000e+00, float addrspace(3)* %tmp4, align 4 + %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10241 + store float 1.000000e+00, float addrspace(3)* %tmp5, align 4 + ret void +} + +; GCN-LABEL: ds_write64_combine_stride_400: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 +; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150 +; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250 +; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 +define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) { +bb: + store double 1.000000e+00, double addrspace(3)* %arg, align 8 + %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 50 + store double 1.000000e+00, double addrspace(3)* %tmp, align 8 + %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100 + store double 1.000000e+00, double addrspace(3)* %tmp1, align 8 + %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150 + store double 1.000000e+00, double addrspace(3)* %tmp2, align 8 + %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200 + store double 1.000000e+00, double addrspace(3)* %tmp3, align 8 + %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250 + store double 1.000000e+00, double addrspace(3)* %tmp4, align 8 + %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300 + store double 1.000000e+00, double addrspace(3)* %tmp5, align 8 + %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350 + store double 1.000000e+00, double addrspace(3)* %tmp6, align 8 + ret void +} + +; GCN-LABEL: ds_write64_combine_stride_8192_shifted: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 +; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 +; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 +define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) { +bb: + %tmp = getelementptr 
inbounds double, double addrspace(3)* %arg, i32 1 + store double 1.000000e+00, double addrspace(3)* %tmp, align 8 + %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025 + store double 1.000000e+00, double addrspace(3)* %tmp1, align 8 + %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049 + store double 1.000000e+00, double addrspace(3)* %tmp2, align 8 + %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073 + store double 1.000000e+00, double addrspace(3)* %tmp3, align 8 + %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097 + store double 1.000000e+00, double addrspace(3)* %tmp4, align 8 + %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121 + store double 1.000000e+00, double addrspace(3)* %tmp5, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll index f461d6978f13..5997e27fd815 100644 --- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll +++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -23,7 +23,7 @@ declare void @llvm.amdgcn.s.barrier() #1 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 ; CHECK: s_endpgm -define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { +define amdgpu_kernel void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { entry: %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %mul = shl nsw i32 %x.i, 1 diff --git a/test/CodeGen/AMDGPU/ds-sub-offset.ll b/test/CodeGen/AMDGPU/ds-sub-offset.ll index 16fb019ae0f3..d74bd5aa15ac 100644 --- a/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12 -define void @write_ds_sub0_offset0_global() #0 { +define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 0, %x.i @@ -24,7 +24,7 @@ entry: ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535 -define void @add_x_shl_neg_to_sub_max_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -39,7 +39,7 @@ define void @add_x_shl_neg_to_sub_max_offset() #1 { ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN: ds_write_b8 [[NEG]], [[K]]{{$}} -define void @add_x_shl_neg_to_sub_max_offset_p1() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -58,7 +58,7 @@ define void @add_x_shl_neg_to_sub_max_offset_p1() #1 { ; GCN-NOT: v_sub ; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}} ; GCN: s_endpgm -define void @add_x_shl_neg_to_sub_multi_use() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() 
#0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -80,7 +80,7 @@ define void @add_x_shl_neg_to_sub_multi_use() #1 { ; GCN-NOT: v_sub ; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} ; GCN: s_endpgm -define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -95,7 +95,7 @@ define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255 -define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -109,7 +109,7 @@ define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]] ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} -define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll index 9a313230e303..2c474dbe7b08 100644 --- a/test/CodeGen/AMDGPU/ds_read2.ll +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -12,7 +12,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -31,7 +31,7 @@ define void @simple_read2_f32(float addrspace(1)* %out) #0 { ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -49,7 +49,7 @@ define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 ; SI: s_endpgm -define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -66,7 +66,7 @@ define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: 
s_endpgm -define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -98,7 +98,7 @@ define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { ; SI: s_barrier ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -133,7 +133,7 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -170,7 +170,7 @@ define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { ; SI: ds_read_b32 ; SI: ds_read_b32 ; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 @@ -196,7 +196,7 @@ define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float ad ; SI: ds_read_b32 ; SI: ds_read_b32 ; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 @@ -219,7 +219,7 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f ; SI-LABEL: {{^}}read2_ptr_is_subreg_f32: ; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} ; SI: s_endpgm -define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1 @@ -243,7 +243,7 @@ define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { +define 
amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4 @@ -261,7 +261,7 @@ define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -280,7 +280,7 @@ define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { ; SI-LABEL: @unaligned_read2_f32 ; SI-NOT: ds_read2_b32 ; SI: s_endpgm -define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 1 @@ -296,7 +296,7 @@ define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* % ; SI-LABEL: @misaligned_2_simple_read2_f32 ; SI-NOT: ds_read2_b32 ; SI: s_endpgm -define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 2 @@ -315,7 +315,7 @@ define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrs ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2_f64(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -331,7 +331,7 @@ define void @simple_read2_f64(double addrspace(1)* %out) #0 { ; SI-LABEL: @simple_read2_f64_max_offset ; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 ; SI: s_endpgm -define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -349,7 +349,7 @@ define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 ; SI: s_endpgm -define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { +define 
amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -367,7 +367,7 @@ define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 ; SI: s_endpgm -define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 4 @@ -385,7 +385,7 @@ define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3) ; SI-LABEL: @load_constant_adjacent_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 %sum = add i32 %val0, %val1 @@ -396,7 +396,7 @@ define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { ; SI-LABEL: @load_constant_disjoint_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 -define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 %sum = add i32 %val0, %val1 @@ -410,7 +410,7 @@ define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 -define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { +define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 %sum = add i64 %val0, %val1 @@ -426,7 +426,7 @@ define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 ; SI: s_endpgm -define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { +define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 
2048), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 %sum = add i64 %val0, %val1 @@ -437,7 +437,7 @@ define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 -define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { +define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i @@ -481,13 +481,13 @@ define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i ret void } -define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 ret void } -define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { %load = load i64, i64 addrspace(3)* %in, align 4 store i64 %load, i64 addrspace(1)* %out, align 8 ret void diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll index 4a3f3fb99700..9668743cf128 100644 --- a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -10,7 +10,7 @@ ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12 -define void @offset_order(float addrspace(1)* %out) { +define amdgpu_kernel void @offset_order(float addrspace(1)* %out) { entry: %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 %val0 = load float, float addrspace(3)* %ptr0 diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll index 9d8375d64037..3dfdaf3936a6 100644 --- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -12,7 +12,7 @@ ; CI: s_waitcnt lgkmcnt(0) ; CI: buffer_store_dwordx2 [[RESULT]] ; CI: s_endpgm -define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4 @@ -26,7 +26,7 @@ define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) ; CI: s_waitcnt lgkmcnt(0) ; CI: buffer_store_dwordx2 [[RESULT]] ; CI: s_endpgm -define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = 
getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0 @@ -43,7 +43,7 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD2]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4 @@ -68,7 +68,7 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD1]] ; CI: s_endpgm -define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4 @@ -88,7 +88,7 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { ; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} ; CI: buffer_store_dwordx4 [[REG_ZW]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8 @@ -101,7 +101,7 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) ; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} ; CI: buffer_store_dwordx4 [[REG_ZW]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0 @@ -117,7 +117,7 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { ; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}} ; CI: s_endpgm -define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0 @@ -138,7 +138,7 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { ; CI-DAG: 
buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 ; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 ; CI: s_endpgm -define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0 @@ -150,10 +150,10 @@ define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { ; Do scalar loads into the super register we need. ; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} -; CI-NOT: v_mov +; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}} ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}} ; CI: s_endpgm -define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1 @@ -173,10 +173,10 @@ define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspa ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; CI-NOT: v_mov +; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}} ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}} ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1 diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll index 99f01b4f2622..81b35a46aa18 100644 --- a/test/CodeGen/AMDGPU/ds_read2st64.ll +++ b/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -10,7 +10,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -29,7 +29,7 @@ define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define 
amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -49,7 +49,7 @@ define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace( ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -69,7 +69,7 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add ; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}} ; SI: s_endpgm -define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -86,7 +86,7 @@ define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, floa ; SI-LABEL: @odd_invalid_read2st64_f32_0 ; SI-NOT: ds_read2st64_b32 ; SI: s_endpgm -define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -102,7 +102,7 @@ define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { ; SI-LABEL: @odd_invalid_read2st64_f32_1 ; SI-NOT: ds_read2st64_b32 ; SI: s_endpgm -define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 @@ -122,7 +122,7 @@ define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -141,7 +141,7 @@ define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* 
%lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -161,7 +161,7 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 ; SI: s_endpgm -define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 4 @@ -181,7 +181,7 @@ define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspac ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 256 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -197,11 +197,11 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a ; SI-LABEL: @simple_read2st64_f64_over_max_offset ; SI-NOT: ds_read2st64_b64 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} +; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 +; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] ; SI: s_endpgm -define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -218,7 +218,7 @@ define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, dou ; SI-LABEL: @invalid_read2st64_f64_odd_offset ; SI-NOT: ds_read2st64_b64 ; SI: s_endpgm -define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -239,7 +239,7 @@ define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double ; SI-NOT: ds_read2st_b64 ; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 ; SI: s_endpgm -define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double 
addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll index ae230dac9378..ab1cf0ba25b5 100644 --- a/test/CodeGen/AMDGPU/ds_write2.ll +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -9,7 +9,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 ; SI: s_endpgm -define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i %val = load float, float addrspace(1)* %in.gep, align 4 @@ -27,7 +27,7 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -46,7 +46,7 @@ define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -65,7 +65,7 @@ define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -86,7 +86,7 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr 
<2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 @@ -107,7 +107,7 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 @@ -126,7 +126,7 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 @@ -146,7 +146,7 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 ; SI: s_endpgm -define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -164,7 +164,7 @@ define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 ; SI: s_endpgm -define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -182,7 +182,7 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x @@ -212,7 +212,7 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float 
addrspac ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x @@ -243,7 +243,7 @@ define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, f ; SI: ds_write_b32 ; SI: ds_write_b32 ; SI: s_endpgm -define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -270,7 +270,7 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 ; SI: s_endpgm -define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, double addrspace(1)* %in.gep, align 8 @@ -288,7 +288,7 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 ; SI: s_endpgm -define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, double addrspace(1)* %in.gep, align 8 @@ -306,7 +306,7 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} ; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 @@ -325,7 +325,7 @@ define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace ; SI-LABEL: @store_constant_adjacent_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_write2_b32 [[ZERO]], 
v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -define void @store_constant_adjacent_offsets() { +define amdgpu_kernel void @store_constant_adjacent_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 ret void @@ -335,7 +335,7 @@ define void @store_constant_adjacent_offsets() { ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 -define void @store_constant_disjoint_offsets() { +define amdgpu_kernel void @store_constant_disjoint_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 ret void @@ -348,7 +348,7 @@ define void @store_constant_disjoint_offsets() { ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; SI: s_endpgm -define void @store_misaligned64_constant_offsets() { +define amdgpu_kernel void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 ret void @@ -362,7 +362,7 @@ define void @store_misaligned64_constant_offsets() { ; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI: s_endpgm -define void @store_misaligned64_constant_large_offsets() { +define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 ret void @@ -371,7 +371,7 @@ define void @store_misaligned64_constant_large_offsets() { @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 -define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 %val = load float, float addrspace(1)* %in @@ -410,7 +410,7 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}} ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}} ; CI: s_endpgm -define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in %val0 = load <4 x float>, <4 x 
float> addrspace(1)* %in.gep, align 4 diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll index 872e77361406..a395af34b67b 100644 --- a/test/CodeGen/AMDGPU/ds_write2st64.ll +++ b/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -7,7 +7,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 ; SI: s_endpgm -define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i %val = load float, float addrspace(1)* %in.gep, align 4 @@ -25,7 +25,7 @@ define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float add ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 ; SI: s_endpgm -define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -46,7 +46,7 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 ; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -66,7 +66,7 @@ define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, fl ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], ; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 ; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 @@ -85,7 +85,7 @@ define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, d ; SI-NOT: ds_write2st64_b64 ; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 ; SI: s_endpgm -define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, 
double addrspace(1)* %in.gep, align 8 diff --git a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index 580dc00f935e..b1107ea7fbcb 100644 --- a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -4,7 +4,7 @@ ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) { +define amdgpu_kernel void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) { %alloca = alloca i32, i32 %n store volatile i32 0, i32* %alloca ret void diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll new file mode 100644 index 000000000000..ace01593808b --- /dev/null +++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -0,0 +1,110 @@ +; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: Most of these cases that don't trigger because of broken cost +; heuristics. Should not need -stress-early-ifcvt + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64: +; GCN: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GCN: v_cmp_neq_f64_e32 vcc, 1.0, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} +; GCN: v_add_f64 v{{\[}}[[ADD_LO:[0-9]+]]:[[ADD_HI:[0-9]+]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} +; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc +; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc +; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +entry: + %v = load double, double addrspace(1)* %in + %cc = fcmp oeq double %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd double %v, %v + br label %endif + +endif: + %r = phi double [ %v, %entry ], [ %u, %if ] + store double %r, double addrspace(1)* %out + ret void +} + +; vcc branch with SGPR inputs +; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle64: +; GCN: v_cmp_neq_f64 +; GCN: v_add_f64 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +entry: + %v = load double, double addrspace(2)* %in + %cc = fcmp oeq double %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd double %v, %v + br label %endif + +endif: + %r = phi double [ %v, %entry ], [ %u, %if ] + store double %r, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle96: +; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 + +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: s_mov_b64 vcc, [[CMP]] + +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc + +; GCN-DAG: buffer_store_dword v +; GCN-DAG: buffer_store_dwordx2 +define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { +entry: + %v = load <3 x i32>, <3 x i32> addrspace(1)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add <3 x i32> %v, %v + br label %endif + +endif: + %r = phi <3 x i32> [ %v, %entry ], [ 
%u, %if ] + store <3 x i32> %r, <3 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle128: +; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 + +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: s_mov_b64 vcc, [[CMP]] + +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc + +; GCN: buffer_store_dwordx4 +define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 { +entry: + %v = load <4 x i32>, <4 x i32> addrspace(1)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add <4 x i32> %v, %v + br label %endif + +endif: + %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ] + store <4 x i32> %r, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll new file mode 100644 index 000000000000..9439130deb9e --- /dev/null +++ b/test/CodeGen/AMDGPU/early-if-convert.ll @@ -0,0 +1,454 @@ +; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: This leaves behind a now unnecessary and with exec + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]] +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]] +; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]] +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %else + +if: + %u0 = fadd float %v, %v + br label %endif + +else: + %u1 = fmul float %v, %v + br label %endif + +endif: + %r = phi float [ %u0, %if ], [ %u1, %else ] + store float %r, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber: +; GCN: ; clobber vcc +; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc +; GCN: s_mov_b64 vcc, [[CMP]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 { +entry: + %v = load 
i32, i32 addrspace(1)* %in + %cc = fcmp oeq float %k, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + call void asm "; clobber $0", "~{VCC}"() #0 + %u = add i32 %v, %v + br label %endif + +endif: + %r = phi i32 [ %v, %entry ], [ %u, %if ] + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; Longest chain of cheap instructions to convert +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap: +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u.0 = fmul float %v, %v + %u.1 = fmul float %v, %u.0 + %u.2 = fmul float %v, %u.1 + %u.3 = fmul float %v, %u.2 + %u.4 = fmul float %v, %u.3 + %u.5 = fmul float %v, %u.4 + %u.6 = fmul float %v, %u.5 + %u.7 = fmul float %v, %u.6 + %u.8 = fmul float %v, %u.7 + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u.8, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; Short chain of cheap instructions to not convert +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive: +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u.0 = fmul float %v, %v + %u.1 = fmul float %v, %u.0 + %u.2 = fmul float %v, %u.1 + %u.3 = fmul float %v, %u.2 + %u.4 = fmul float %v, %u.3 + %u.5 = fmul float %v, %u.4 + %u.6 = fmul float %v, %u.5 + %u.7 = fmul float %v, %u.6 + %u.8 = fmul float %v, %u.7 + %u.9 = fmul float %v, %u.8 + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u.9, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; Should still branch over fdiv expansion +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive: +; GCN: v_cmp_neq_f32_e32 +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_div_scale_f32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fdiv float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; vcc branch with SGPR inputs +; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle: +; GCN: v_cmp_neq_f32_e64 +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: s_add_i32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 { +entry: + %v = load i32, i32 addrspace(2)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add i32 %v, %v + br label %endif + +endif: + %r = phi i32 [ %v, %entry ], [ %u, %if 
] + store i32 %r, i32 addrspace(1)* %out + ret void + +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load: +; GCN: v_cndmask_b32 +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 { +entry: + %v = load float, float addrspace(2)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; Due to broken cost heuristic, this is not if converted like +; test_vccnz_ifcvt_triangle_constant_load even though it should be. + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload: +; GCN: v_cndmask_b32 +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 { +entry: + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; Scalar branch and scalar inputs +; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle: +; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]] +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]] +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 { +entry: + %v = load i32, i32 addrspace(2)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = add i32 %v, %v + br label %endif + +endif: + %r = phi i32 [ %v, %entry ], [ %u, %if ] + call void asm sideeffect "; reg use $0", "s"(i32 %r) #0 + ret void +} + +; FIXME: Should be able to use VALU compare and select +; Scalar branch but VGPR select operands +; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle: +; GCN: s_cmp_lg_u32 +; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_add_f32_e32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = fadd float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64: +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 { +entry: + %v = load i64, i64 addrspace(2)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = add i64 %v, %v + br label %endif + +endif: + %r = phi i64 [ %v, %entry ], [ %u, %if ] + call void asm sideeffect "; reg use $0", "s"(i64 %r) #0 + ret void +} + +; TODO: Can do s_cselect_b64; s_cselect_b32 +; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96: +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define amdgpu_kernel void 
@test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 { +entry: + %v = load <3 x i32>, <3 x i32> addrspace(2)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = add <3 x i32> %v, %v + br label %endif + +endif: + %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ] + %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> + call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0 + ret void +} + +; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128: +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 { +entry: + %v = load <4 x i32>, <4 x i32> addrspace(2)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = add <4 x i32> %v, %v + br label %endif + +endif: + %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ] + call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0 + ret void +} + +; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select: +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}} +define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) { +entry: + %cmp0 = icmp eq i32 %cond, 0 + br i1 %cmp0, label %else, label %if + +if: + br label %done + +else: + br label %done + +done: + %value = phi i32 [0, %if], [1, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}ifcvt_undef_scc: +; GCN: {{^}}; BB#0: +; GCN-NEXT: s_load_dwordx2 +; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0 +define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) { +entry: + br i1 undef, label %else, label %if + +if: + br label %done + +else: + br label %done + +done: + %value = phi i32 [0, %if], [1, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256: +; GCN: v_cmp_neq_f32 +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_add_i32 +; GCN: v_add_i32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 { +entry: + %v = load <8 x i32>, <8 x i32> addrspace(1)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add <8 x i32> %v, %v + br label %endif + +endif: + %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ] + store <8 x i32> %r, <8 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512: +; GCN: v_cmp_neq_f32 +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_add_i32 +; GCN: v_add_i32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 { +entry: + %v = load <16 x i32>, <16 x i32> addrspace(1)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add <16 x i32> %v, %v + br label %endif + +endif: + %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ] + store <16 x i32> %r, <16 x i32> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git 
a/test/CodeGen/AMDGPU/early-inline-alias.ll b/test/CodeGen/AMDGPU/early-inline-alias.ll new file mode 100644 index 000000000000..42dfa4e7ab4f --- /dev/null +++ b/test/CodeGen/AMDGPU/early-inline-alias.ll @@ -0,0 +1,12 @@ +; RUN: opt -mtriple=amdgcn-- -O1 -S -inline-threshold=1 %s | FileCheck %s + +; CHECK: @add1alias = alias i32 (i32), i32 (i32)* @add1 +; CHECK: @add1alias2 = alias i32 (i32), i32 (i32)* @add1 + +@add1alias = alias i32 (i32), i32 (i32)* @add1 +@add1alias2 = alias i32 (i32), i32 (i32)* @add1 + +define i32 @add1(i32) { + %2 = add nsw i32 %0, 1 + ret i32 %2 +} diff --git a/test/CodeGen/AMDGPU/early-inline.ll b/test/CodeGen/AMDGPU/early-inline.ll new file mode 100644 index 000000000000..c871d54bec7e --- /dev/null +++ b/test/CodeGen/AMDGPU/early-inline.ll @@ -0,0 +1,25 @@ +; RUN: opt -mtriple=amdgcn-- -O1 -S -inline-threshold=1 -amdgpu-early-inline-all %s | FileCheck %s + +; CHECK: @c_alias +@c_alias = alias i32 (i32), i32 (i32)* @callee + +define i32 @callee(i32 %x) { +entry: + %mul1 = mul i32 %x, %x + %mul2 = mul i32 %mul1, %x + %mul3 = mul i32 %mul1, %mul2 + %mul4 = mul i32 %mul3, %mul2 + %mul5 = mul i32 %mul4, %mul3 + ret i32 %mul5 +} + +; CHECK-LABEL: @caller +; CHECK: mul i32 +; CHECK-NOT: call i32 + +define amdgpu_kernel void @caller(i32 %x) { +entry: + %res = call i32 @callee(i32 %x) + store volatile i32 %res, i32 addrspace(1)* undef + ret void +} diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll index 628dd5ec839e..b22f8608d7e3 100644 --- a/test/CodeGen/AMDGPU/elf.ll +++ b/test/CodeGen/AMDGPU/elf.ll @@ -24,11 +24,13 @@ ; TONGA-NEXT: .long 704 ; CONFIG: .p2align 8 ; CONFIG: test: -define amdgpu_ps void @test(i32 %p) { +define amdgpu_ps void @test(i32 %p) #0 { %i = add i32 %p, 2 %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r, float %r, i1 true, i1 false) ret void } -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/elf.r600.ll b/test/CodeGen/AMDGPU/elf.r600.ll index 51cd08500932..93c5e5575033 100644 --- a/test/CodeGen/AMDGPU/elf.r600.ll +++ b/test/CodeGen/AMDGPU/elf.r600.ll @@ -9,7 +9,7 @@ ; CONFIG-NEXT: .long 2 ; CONFIG-NEXT: .long 165900 ; CONFIG-NEXT: .long 0 -define void @test(float addrspace(1)* %out, i32 %p) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i32 %p) { %i = add i32 %p, 2 %r = bitcast i32 %i to float store float %r, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/else.ll b/test/CodeGen/AMDGPU/else.ll index ef1e64763d4a..22338e4f50e5 100644 --- a/test/CodeGen/AMDGPU/else.ll +++ b/test/CodeGen/AMDGPU/else.ll @@ -1,12 +1,12 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}else_no_execfix: ; CHECK: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]], ; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]] ; CHECK-NEXT: ; mask branch -define amdgpu_ps float @else_no_execfix(i32 %z, float %v) { +define amdgpu_ps float @else_no_execfix(i32 %z, float %v) #0 { main_body: %cc = icmp sgt i32 
%z, 5 br i1 %cc, label %if, label %else @@ -33,7 +33,7 @@ end: ; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]] ; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]] ; CHECK-NEXT: ; mask branch -define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) { +define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 { main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -44,8 +44,7 @@ if: else: %c = fmul float %v, 3.0 - %c.i = bitcast float %c to i32 - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %v.else = extractelement <4 x float> %tex, i32 0 br label %end @@ -55,6 +54,9 @@ end: ret void } -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone +attributes #0 = { nounwind } +attributes #1 = { nounwind writeonly } +attributes #2 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/empty-function.ll b/test/CodeGen/AMDGPU/empty-function.ll index a060900811ea..1231cb4d1de2 100644 --- a/test/CodeGen/AMDGPU/empty-function.ll +++ b/test/CodeGen/AMDGPU/empty-function.ll @@ -7,14 +7,14 @@ ; SI-LABEL: {{^}}empty_function_ret: ; SI: s_endpgm ; SI: codeLenInByte = 4 -define void @empty_function_ret() #0 { +define amdgpu_kernel void @empty_function_ret() #0 { ret void } ; SI: .text ; SI-LABEL: {{^}}empty_function_unreachable: ; SI: codeLenInByte = 0 -define void @empty_function_unreachable() #0 { +define amdgpu_kernel void @empty_function_unreachable() #0 { unreachable } diff --git a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll new file mode 100644 index 000000000000..6eb1fc1d0cc2 --- /dev/null +++ b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE %s +; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s +; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s + +; Test that the -enable-no-signed-zeros-fp-math flag works + +; GCN-LABEL: {{^}}fneg_fsub_f32: +; GCN: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] + +; GCN-UNSAFE-NOT: xor +define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %a = load float, float addrspace(1)* %in, align 4 + %b = load float, float addrspace(1)* %b_ptr, align 4 + %result = fsub float %a, %b + %neg.result = fsub float -0.0, %result + store float %neg.result, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll index c67095438ee5..bd861e0c663e 100644 --- 
a/test/CodeGen/AMDGPU/endcf-loop-header.ll +++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -12,7 +12,7 @@ ; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} ; CHECK-NOT: s_or_b64 exec, exec ; CHECK: s_cbranch_execnz [[LOOP_LABEL]] -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: %cond = call i32 @llvm.r600.read.tidig.x() #0 %tmp0 = icmp eq i32 %cond, 0 diff --git a/test/CodeGen/AMDGPU/env-amdgiz.ll b/test/CodeGen/AMDGPU/env-amdgiz.ll new file mode 100644 index 000000000000..70e4fb30d3aa --- /dev/null +++ b/test/CodeGen/AMDGPU/env-amdgiz.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa-amdgiz -verify-machineinstrs < %s +; Just check the target feature and data layout is accepted without error. + +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" +target triple = "amdgcn-amd-amdhsa-amdgiz" + +define void @foo() { +entry: + ret void +} + diff --git a/test/CodeGen/AMDGPU/env-amdgizcl.ll b/test/CodeGen/AMDGPU/env-amdgizcl.ll new file mode 100644 index 000000000000..feb213562c80 --- /dev/null +++ b/test/CodeGen/AMDGPU/env-amdgizcl.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa-amdgizcl -verify-machineinstrs < %s +; Just check the target feature and data layout is accepted without error. + +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" +target triple = "amdgcn-amd-amdhsa-amdgizcl" + +define void @foo() { +entry: + ret void +} + diff --git a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll index 8ef54b9e95d3..40d115bfc060 100644 --- a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_tahiti -define void @use_too_many_sgprs_tahiti() #0 { +define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -20,7 +20,7 @@ define void @use_too_many_sgprs_tahiti() #0 { } ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire -define void @use_too_many_sgprs_bonaire() #1 { +define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -39,7 +39,7 @@ define void @use_too_many_sgprs_bonaire() #1 { } ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr -define void @use_too_many_sgprs_bonaire_flat_scr() #1 { +define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm 
sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -59,7 +59,7 @@ define void @use_too_many_sgprs_bonaire_flat_scr() #1 { } ; ERROR: error: scalar registers limit of 96 exceeded (98) in use_too_many_sgprs_iceland -define void @use_too_many_sgprs_iceland() #2 { +define amdgpu_kernel void @use_too_many_sgprs_iceland() #2 { call void asm sideeffect "", "~{VCC}" () call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () @@ -77,7 +77,7 @@ define void @use_too_many_sgprs_iceland() #2 { } ; ERROR: error: addressable scalar registers limit of 102 exceeded (103) in use_too_many_sgprs_fiji -define void @use_too_many_sgprs_fiji() #3 { +define amdgpu_kernel void @use_too_many_sgprs_fiji() #3 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () diff --git a/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll b/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll index cf384da2c5be..0fa06b87eba2 100644 --- a/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll +++ b/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: and_zext: ; GCN: v_and_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id %a = load i16, i16 addrspace(1)* %in @@ -18,7 +18,7 @@ define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { ; GCN-LABEL: or_zext: ; GCN: v_or_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id %a = load i16, i16 addrspace(1)* %in @@ -33,7 +33,7 @@ define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { ; GCN-LABEL: xor_zext: ; GCN: v_xor_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id %a = load i16, i16 addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/extload-align.ll b/test/CodeGen/AMDGPU/extload-align.ll index 9d2eb74c7ba9..4644800421d8 100644 --- a/test/CodeGen/AMDGPU/extload-align.ll +++ b/test/CodeGen/AMDGPU/extload-align.ll @@ -9,7 +9,7 @@ ; DEBUG: mem:LD2[]{{[^(]}} ; DEBUG: {{^}}# End machine code for function extload_align. 
-define void @extload_align(i32* %out, i32 %index) #0 { +define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 { %v0 = alloca [4 x i16] %a1 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 0 %a2 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 1 diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll index 6cebe5f495c5..fd298b361d03 100644 --- a/test/CodeGen/AMDGPU/extload-private.ll +++ b/test/CodeGen/AMDGPU/extload-private.ll @@ -2,8 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}} -define void @load_i8_sext_private(i32 addrspace(1)* %out) { +; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8 %tmp1 = load i8, i8* %tmp0 @@ -13,8 +13,8 @@ entry: } ; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}} -define void @load_i8_zext_private(i32 addrspace(1)* %out) { +; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8 %tmp1 = load i8, i8* %tmp0 @@ -24,8 +24,8 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}} -define void @load_i16_sext_private(i32 addrspace(1)* %out) { +; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 %tmp1 = load i16, i16* %tmp0 @@ -35,8 +35,8 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}} -define void @load_i16_zext_private(i32 addrspace(1)* %out) { +; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 %tmp1 = load volatile i16, i16* %tmp0 diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll index 8b3e087d1f45..a7b8e86220aa 100644 --- a/test/CodeGen/AMDGPU/extload.ll +++ b/test/CodeGen/AMDGPU/extload.ll @@ -10,7 +10,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], ; EG: VTX_READ_32 [[VAL]] -define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* %load = load i32, i32 addrspace(1)* %cast %x = bitcast i32 %load to <4 x i8> @@ -25,7 +25,7 @@ define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 a ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], ; EG: VTX_READ_32 [[VAL]] -define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { %cast = bitcast 
i16 addrspace(1)* %src to i32 addrspace(1)* %load = load i32, i32 addrspace(1)* %cast %x = bitcast i32 %load to <2 x i16> @@ -40,7 +40,7 @@ define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i1 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] ; EG: LDS_WRITE * [[VAL]] -define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* %load = load i32, i32 addrspace(3)* %cast %x = bitcast i32 %load to <4 x i8> @@ -55,7 +55,7 @@ define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 ad ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] ; EG: LDS_WRITE * [[VAL]] -define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* %load = load i32, i32 addrspace(3)* %cast %x = bitcast i32 %load to <2 x i16> diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll index 4edff152e66e..be85ca933c33 100644 --- a/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll +++ b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll @@ -13,7 +13,7 @@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i32 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { @@ -55,7 +55,7 @@ define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noal ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i32 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { @@ -99,7 +99,7 @@ define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace( ; GCN: buffer_store_dwordx2 ; GCN: buffer_store_dwordx2 -define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i64 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll new file mode 100644 index 000000000000..1f567ae05081 --- /dev/null +++ b/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -0,0 +1,128 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}extract_vector_elt_v2f16: +; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN: 
s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 +; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]] +; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] +; GCN-DAG: buffer_store_short [[VELT0]] +; GCN-DAG: buffer_store_short [[VELT1]] +define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { + %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr + %p0 = extractelement <2 x half> %vec, i32 0 + %p1 = extractelement <2 x half> %vec, i32 1 + %out1 = getelementptr half, half addrspace(1)* %out, i32 10 + store half %p1, half addrspace(1)* %out, align 2 + store half %p0, half addrspace(1)* %out1, align 2 + ret void +} + +; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_sgpr: +; GCN: s_load_dword [[IDX:s[0-9]+]] +; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 16 +; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]] +; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] +; GCN: buffer_store_short [[VELT1]] +; GCN: ScratchSize: 0 +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 { + %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr + %elt = extractelement <2 x half> %vec, i32 %idx + store half %elt, half addrspace(1)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_vgpr: +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: {{flat|buffer}}_load_dword [[IDX:v[0-9]+]] +; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 16, [[IDX]] + +; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]] +; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]] + + +; SI: buffer_store_short [[ELT]] +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] +; GCN: ScratchSize: 0{{$}} +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr + %idx = load i32, i32 addrspace(1)* %gep + %elt = extractelement <2 x half> %vec, i32 %idx + store half %elt, half addrspace(1)* %out.gep, align 2 + ret void +} + +; GCN-LABEL: {{^}}extract_vector_elt_v3f16: +; GCN: buffer_load_ushort +; GCN: buffer_store_short +; GCN: buffer_store_short +define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 { + %p0 = extractelement <3 x half> %foo, i32 0 + %p1 = extractelement <3 x half> %foo, i32 2 + %out1 = getelementptr half, half addrspace(1)* %out, i32 1 + store half %p1, half addrspace(1)* %out, align 2 + store half %p0, half addrspace(1)* %out1, align 2 + ret void +} + +; GCN-LABEL: {{^}}extract_vector_elt_v4f16: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_store_short +; GCN: buffer_store_short +define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 { + %p0 = extractelement <4 x half> %foo, i32 0 + %p1 = extractelement <4 x half> %foo, i32 2 + %out1 = getelementptr half, half addrspace(1)* %out, i32 10 + store half %p1, half addrspace(1)* %out, align 2 + store half %p0, half addrspace(1)* %out1, align 2 + ret void +} + +; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: +; GCN: buffer_load_ushort 
+; GCN: buffer_load_ushort +; GCN: buffer_load_ushort + +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short + +; GCN: buffer_load_ushort +; GCN: buffer_store_short +define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { + %p0 = extractelement <3 x half> %foo, i32 %idx + %out1 = getelementptr half, half addrspace(1)* %out, i32 1 + store half %p0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4f16: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort + +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short + +; GCN: buffer_load_ushort +; GCN: buffer_store_short +define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 { + %p0 = extractelement <4 x half> %foo, i32 %idx + %out1 = getelementptr half, half addrspace(1)* %out, i32 1 + store half %p0, half addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll index 4594379dae03..db5bf0b4e808 100644 --- a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll +++ b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll @@ -5,7 +5,7 @@ ; GCN: buffer_load_dwordx4 ; GCN: buffer_load_dwordx2 ; GCN: buffer_store_dwordx2 -define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in %elt = extractelement <3 x double> %ld, i32 2 store volatile double %elt, double addrspace(1)* %out @@ -13,14 +13,14 @@ define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64: -define void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64: -define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index c407f0efffb4..9b117d48a980 100644 --- a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -1,25 +1,67 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | 
FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}extract_vector_elt_v2i16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) #0 { - %p0 = extractelement <2 x i16> %foo, i32 0 - %p1 = extractelement <2 x i16> %foo, i32 1 +; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 +; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]] +; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] +; GCN-DAG: buffer_store_short [[VELT0]] +; GCN-DAG: buffer_store_short [[VELT1]] +define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %p0 = extractelement <2 x i16> %vec, i32 0 + %p1 = extractelement <2 x i16> %vec, i32 1 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10 store i16 %p1, i16 addrspace(1)* %out, align 2 store i16 %p0, i16 addrspace(1)* %out1, align 2 ret void } +; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_sgpr: +; GCN: s_load_dword [[IDX:s[0-9]+]] +; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 16 +; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]] +; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] +; GCN: buffer_store_short [[VELT1]] +; GCN: ScratchSize: 0 +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt = extractelement <2 x i16> %vec, i32 %idx + store i16 %elt, i16 addrspace(1)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr: +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: {{flat|buffer}}_load_dword [[IDX:v[0-9]+]] +; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 16, [[IDX]] + +; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]] +; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]] + +; SI: buffer_store_short [[ELT]] +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] +; GCN: ScratchSize: 0{{$}} +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext + %idx = load volatile i32, i32 addrspace(1)* %gep + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt = extractelement <2 x i16> %vec, i32 %idx + store i16 %elt, i16 addrspace(1)* %out.gep, align 2 + ret void +} + ; GCN-LABEL: {{^}}extract_vector_elt_v3i16: ; GCN: buffer_load_ushort ; GCN: buffer_store_short ; GCN: buffer_store_short -define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { %p0 = extractelement <3 x i16> %foo, i32 0 %p1 = extractelement <3 x i16> %foo, i32 2 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 @@ -29,16 +71,23 @@ define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 } ; 
GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short + +; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c +; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30 +; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]] +; GFX9-DAG: buffer_store_short [[VLOAD0]], off +; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]] +; GFX9-DAG: buffer_store_short [[VLOAD1]], off +define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { %p0 = extractelement <4 x i16> %foo, i32 0 %p1 = extractelement <4 x i16> %foo, i32 2 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10 - store i16 %p1, i16 addrspace(1)* %out, align 2 - store i16 %p0, i16 addrspace(1)* %out1, align 2 + store volatile i16 %p1, i16 addrspace(1)* %out, align 2 + store volatile i16 %p0, i16 addrspace(1)* %out1, align 2 ret void } @@ -47,13 +96,16 @@ define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short + +; GFX9: buffer_store_dword +; GFX9: buffer_store_dword ; GCN: buffer_load_ushort ; GCN: buffer_store_short -define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 %p0, i16 addrspace(1)* %out @@ -61,23 +113,32 @@ define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short -; GCN: buffer_load_ushort -; GCN: buffer_store_short -define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 { +; SICIVI: buffer_load_ushort +; SICIVI: buffer_store_short + +; GFX9: s_load_dword +; GFX9: buffer_store_dword +; GFX9: buffer_store_dword +; GFX9: buffer_load_ushort +; GFX9: buffer_store_short +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <4 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 %p0, i16 addrspace(1)* %out ret void } +declare i32 @llvm.amdgcn.workitem.id.x() #1 + attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll index 1df91c93329a..a8d127879a32 100644 --- a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll +++ b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll @@ -8,7 +8,7 
@@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dwordx2 -define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 { +define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 { %vec = bitcast i64 %val to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 0 %elt1 = extractelement <2 x i32> %vec, i32 1 @@ -20,7 +20,7 @@ define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspa } ; GCN-LABEL: {{^}}extract_vector_elt_v2i64: -define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 { %p0 = extractelement <2 x i64> %foo, i32 0 %p1 = extractelement <2 x i64> %foo, i32 1 %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1 @@ -30,14 +30,14 @@ define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64: -define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <2 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2: -define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 { %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo %or = or <2 x i64> %load, %arst %dynelt = extractelement <2 x i64> %or, i32 %elt @@ -46,14 +46,14 @@ define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> ad } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64: -define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64: -define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 6f4ae827f432..b7d768fd5525 100644 --- a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}extract_vector_elt_v1i8: ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte -define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { %p0 = extractelement <1 x i8> %foo, i32 0 store i8 %p0, i8 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x 
i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { %p0 = extractelement <2 x i8> %foo, i32 0 %p1 = extractelement <2 x i8> %foo, i32 1 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -29,7 +29,7 @@ define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { %p0 = extractelement <3 x i8> %foo, i32 0 %p1 = extractelement <3 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -43,7 +43,7 @@ define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { %p0 = extractelement <4 x i8> %foo, i32 0 %p1 = extractelement <4 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -57,7 +57,7 @@ define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { %p0 = extractelement <8 x i8> %foo, i32 0 %p1 = extractelement <8 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -71,7 +71,7 @@ define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 { %p0 = extractelement <16 x i8> %foo, i32 0 %p1 = extractelement <16 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -85,7 +85,7 @@ define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { %p0 = extractelement <32 x i8> %foo, i32 0 %p1 = extractelement <32 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -99,7 +99,7 @@ define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 { %p0 = extractelement <64 x i8> %foo, i32 0 %p1 = extractelement <64 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -120,7 +120,7 @@ define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 ; GCN: buffer_store_byte ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte -define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 
x i8> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i8> %foo, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out @@ -141,7 +141,7 @@ define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %fo ; GCN: buffer_store_byte ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte -define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 { %p0 = extractelement <4 x i8> %foo, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index e160c20a03a0..34999fa3aea4 100644 --- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -7,7 +7,7 @@ ; GCN-DAG: buffer_load_dword [[A:v[0-9]+]] ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]] ; GCN: buffer_store_dword [[ADD]] -define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { +define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { %a = load i64, i64 addrspace(1)* %in %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x i32> @@ -20,7 +20,7 @@ define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspa ; GCN: buffer_load_dwordx2 ; GCN: v_add_f64 ; GCN: buffer_store_dword v -define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { +define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { %a = load double, double addrspace(1)* %in %add = fadd double %a, %b %val.bc = bitcast double %add to <2 x i32> @@ -33,7 +33,7 @@ define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrs ; GCN: buffer_load_dwordx2 ; GCN: v_add_i32 ; GCN: buffer_store_dword -define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { +define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { %a = load i64, i64 addrspace(1)* %in %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x float> @@ -45,7 +45,7 @@ define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 add ; GCN-LABEL: {{^}}no_extract_volatile_load_extract0: ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %elt0 = extractelement <4 x i32> %vec, i32 0 @@ -57,7 +57,7 @@ entry: ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %elt2 = extractelement <4 x i32> %vec, i32 2 @@ -68,7 +68,7 @@ entry: ; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract: ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_dynextract(i32 
addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %eltN = extractelement <4 x i32> %vec, i32 %idx diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll index c64aa6228c71..d4ef7124a334 100644 --- a/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fabs.f16.ll @@ -1,69 +1,74 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; DAGCombiner will transform: ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF)) ; unless isFabsFree returns true -; GCN-LABEL: {{^}}fabs_free_f16: +; GCN-LABEL: {{^}}s_fabs_free_f16: ; GCN: flat_load_ushort [[VAL:v[0-9]+]], ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fabs_free_f16(half addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc= bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) store half %fabs, half addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}fabs_f16: +; GCN-LABEL: {{^}}s_fabs_f16: ; CI: flat_load_ushort [[VAL:v[0-9]+]], -; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]] -; CI: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], |[[CVT0]]| +; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fabs_f16(half addrspace(1)* %out, half %in) { +define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) store half %fabs, half addrspace(1)* %out ret void } ; FIXME: Should be able to use single and -; GCN-LABEL: {{^}}fabs_v2f16: -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| +; GCN-LABEL: {{^}}s_fabs_v2f16: +; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_or_b32_e32 -; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: flat_load_ushort [[HI:v[0-9]+]] +; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[LO]] ; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]] -; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, -; VI: v_or_b32 +; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[MASK]], [[LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]] ; VI: flat_store_dword -define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { + +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff +define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) store <2 x half> %fabs, <2 x half> addrspace(1)* %out ret void } -; GCN-LABEL: 
{{^}}fabs_v4f16: -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| +; GCN-LABEL: {{^}}s_fabs_v4f16: +; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: flat_store_dwordx2 -define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { +define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) store <4 x half> %fabs, <4 x half> addrspace(1)* %out ret void @@ -72,22 +77,74 @@ define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { ; GCN-LABEL: {{^}}fabs_fold_f16: ; GCN: flat_load_ushort [[IN0:v[0-9]+]] ; GCN: flat_load_ushort [[IN1:v[0-9]+]] + ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]] -; CI-DAG: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], [[IN1]] -; CI: v_mul_f32_e64 [[RESULT:v[0-9]+]], |[[CVT1]]|, [[CVT0]] +; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]| +; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]] ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] ; VI-NOT: and ; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { +define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { %fabs = call half @llvm.fabs.f16(half %in0) %fmul = fmul half %fabs, %in1 store half %fmul, half addrspace(1)* %out ret void } -declare half @llvm.fabs.f16(half) readnone -declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone -declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone +; GCN-LABEL: {{^}}v_fabs_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] +; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]] +define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2 + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fabs_free_v2f16: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff +define 
amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { + %bc = bitcast i32 %in to <2 x half> + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc) + store <2 x half> %fabs, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fabs_fold_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] + +; CI: v_cvt_f32_f16_e32 +; CI: v_cvt_f32_f16_e32 +; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} +; CI: v_cvt_f16_f32 +; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} +; CI: v_cvt_f16_f32 + +; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} +; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} + +; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]] +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}} +define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %fmul = fmul <2 x half> %fabs, %val + store <2 x half> %fmul, <2 x half> addrspace(1)* %out + ret void +} + +declare half @llvm.fabs.f16(half) #1 +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll index f7780b875ff5..998e02f7bdf8 100644 --- a/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/test/CodeGen/AMDGPU/fabs.f64.ll @@ -10,7 +10,7 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone ; FUNC-LABEL: {{^}}v_fabs_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tidext = sext i32 %tid to i64 %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext @@ -24,7 +24,7 @@ define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { ; SI: v_and_b32 ; SI-NOT: v_and_b32 ; SI: s_endpgm -define void @fabs_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) { %fabs = call double @llvm.fabs.f64(double %in) store double %fabs, double addrspace(1)* %out ret void @@ -34,7 +34,7 @@ define void @fabs_f64(double addrspace(1)* %out, double %in) { ; SI: v_and_b32 ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) store <2 x double> %fabs, <2 x double> addrspace(1)* %out ret void @@ -46,7 +46,7 @@ define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { ; SI: v_and_b32 ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) store <4 x double> %fabs, <4 x double> addrspace(1)* %out ret void @@ -57,7 +57,7 @@ define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]| ; SI: 
s_endpgm -define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { %fabs = call double @llvm.fabs.f64(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out @@ -69,7 +69,7 @@ define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]| ; SI: s_endpgm -define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { %fabs = call double @fabs(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out @@ -79,7 +79,7 @@ define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in ; FUNC-LABEL: {{^}}fabs_free_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { %bc= bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) store double %fabs, double addrspace(1)* %out @@ -89,7 +89,7 @@ define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { ; FUNC-LABEL: {{^}}fabs_fn_free_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { %bc= bitcast i64 %in to double %fabs = call double @fabs(double %bc) store double %fabs, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll index 98e7f9e3e9ad..ac8fa3e45ef5 100644 --- a/test/CodeGen/AMDGPU/fabs.ll +++ b/test/CodeGen/AMDGPU/fabs.ll @@ -13,7 +13,7 @@ ; GCN: v_and_b32 -define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @fabs(float %bc) store float %fabs, float addrspace(1)* %out @@ -26,7 +26,7 @@ define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { ; GCN: v_and_b32 -define void @fabs_free(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fabs_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) store float %fabs, float addrspace(1)* %out @@ -37,7 +37,7 @@ define void @fabs_free(float addrspace(1)* %out, i32 %in) { ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; GCN: v_and_b32 -define void @fabs_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) store float %fabs, float addrspace(1)* %out ret void @@ -49,7 +49,7 @@ define void @fabs_f32(float addrspace(1)* %out, float %in) { ; GCN: v_and_b32 ; GCN: v_and_b32 -define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out ret void @@ -65,7 +65,7 @@ define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { ; GCN: v_and_b32 ; GCN: v_and_b32 ; GCN: v_and_b32 -define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, 
<4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, <4 x float> addrspace(1)* %out ret void @@ -76,7 +76,7 @@ define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]| -define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { +define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @fabs(float %in0) %fmul = fmul float %fabs, %in1 store float %fmul, float addrspace(1)* %out @@ -88,7 +88,7 @@ define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]| -define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { +define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @llvm.fabs.f32(float %in0) %fmul = fmul float %fabs, %in1 store float %fmul, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index b74bce76f79c..9edf55cbc69f 100644 --- a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -28,7 +28,7 @@ ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -55,7 +55,7 @@ define void @fast_add_fmuladd_fmul() #0 { ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]] ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]] ; GCN-FASTFMA: buffer_store_dword [[FMA1]] -define void @fast_sub_fmuladd_fmul() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -87,7 +87,7 @@ define void @fast_sub_fmuladd_fmul() #0 { ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_mul() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -120,7 +120,7 @@ define void @fast_add_fmuladd_fmul_multi_use_mul() #0 { ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -145,7 +145,7 @@ define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { %x = load volatile float, float addrspace(1)* undef %y = 
load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -170,7 +170,7 @@ define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -205,7 +205,7 @@ define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { ; GCN: buffer_store_dword [[MUL]] ; GCN: buffer_store_dword [[MAD]] -define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -241,7 +241,7 @@ define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_subrev_f32_e32 -define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll index 9ca077564e2b..f76ecf58d905 100644 --- a/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/test/CodeGen/AMDGPU/fadd.f16.ll @@ -11,7 +11,7 @@ ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16( +define amdgpu_kernel void @fadd_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -25,14 +25,13 @@ entry: ; GCN-LABEL: {{^}}fadd_f16_imm_a ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16_imm_a( +define amdgpu_kernel void @fadd_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -44,14 +43,13 @@ entry: ; GCN-LABEL: {{^}}fadd_f16_imm_b ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4000{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16_imm_b( +define amdgpu_kernel void @fadd_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -61,27 +59,31 @@ entry: ret void } -; GCN-LABEL: {{^}}fadd_v2f16 +; GCN-LABEL: {{^}}fadd_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: 
v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16( +define amdgpu_kernel void @fadd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -93,25 +95,26 @@ entry: ret void } -; GCN-LABEL: {{^}}fadd_v2f16_imm_a +; GCN-LABEL: {{^}}fadd_v2f16_imm_a: ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] +; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16_imm_a( +define amdgpu_kernel void @fadd_v2f16_imm_a( <2 x half> 
addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -121,25 +124,26 @@ entry: ret void } -; GCN-LABEL: {{^}}fadd_v2f16_imm_b +; GCN-LABEL: {{^}}fadd_v2f16_imm_b: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}} -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]] -; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[A_F16_1]] +; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_1]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16_imm_b( +define amdgpu_kernel void @fadd_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll index 0f683f7bfa23..621a0de281db 100644 --- a/test/CodeGen/AMDGPU/fadd.ll +++ b/test/CodeGen/AMDGPU/fadd.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}fadd_f32: ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W ; SI: v_add_f32 -define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 { %add = fadd float %a, %b store float %add, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) { ; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y ; SI: v_add_f32 ; SI: v_add_f32 -define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { %add = fadd <2 x float> %a, %b store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 ret void @@ -31,7 +31,7 @@ define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo ; SI: v_add_f32 ; SI: v_add_f32 ; SI: v_add_f32 -define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 @@ -57,8 +57,19 @@ define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; SI: v_add_f32 ; SI: v_add_f32 ; SI: v_add_f32 -define 
void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) { +define amdgpu_kernel void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { %add = fadd <8 x float> %a, %b store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32 ret void } + +; FUNC-LABEL: {{^}}fadd_0_nsz_attr_f32: +; SI-NOT: v_add_f32 +define amdgpu_kernel void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 { + %add = fadd float %a, 0.0 + store float %add, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll index 6f0c9de8ebaf..7eb7747de215 100644 --- a/test/CodeGen/AMDGPU/fadd64.ll +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}v_fadd_f64: ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} -define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -14,7 +14,7 @@ define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}s_fadd_f64: ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { +define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { %r2 = fadd double %r0, %r1 store double %r2, double addrspace(1)* %out ret void @@ -24,7 +24,7 @@ define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { ; CHECK: v_add_f64 ; CHECK: v_add_f64 ; CHECK: _store_dwordx4 -define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 @@ -37,7 +37,7 @@ define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} ; CHECK: _store_dwordx4 -define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { +define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { %r2 = fadd <2 x double> %r0, %r1 store <2 x double> %r2, <2 x double> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index ad3992f4cd03..f2686a5582dc 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -1,11 +1,15 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +declare half @llvm.fabs.f16(half) #0 declare half 
@llvm.canonicalize.f16(half) #0 +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0 +declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, half addrspace(1)* %out @@ -15,17 +19,51 @@ define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16: ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} ; GCN: buffer_store_short [[REG]] -define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, half addrspace(1)* %out ret void } +; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16: +; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| +; GCN: buffer_store_short [[REG]] +define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { + %val = load half, half addrspace(1)* %out + %val.fabs = call half @llvm.fabs.f16(half %val) + %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs) + store half %canonicalized, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16: +; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}| +; GCN: buffer_store_short [[REG]] +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 { + %val = load half, half addrspace(1)* %out + %val.fabs = call half @llvm.fabs.f16(half %val) + %val.fabs.fneg = fsub half -0.0, %val.fabs + %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg) + store half %canonicalized, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16: +; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}} +; GCN: buffer_store_short [[REG]] +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { + %val = load half, half addrspace(1)* %out + %val.fneg = fsub half -0.0, %val + %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) + store half %canonicalized, half addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -34,7 +72,7 @@ define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -0.0) store half %canonicalized, half addrspace(1)* %out ret void @@ 
-43,7 +81,7 @@ define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 1.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -52,7 +90,7 @@ define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -1.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -61,16 +99,16 @@ define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 16.0) store half %canonicalized, half addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -79,16 +117,16 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f16(half addrspace(1) ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -97,7 +135,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f16(half addrspace(1) ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} ; 
GCN: buffer_store_short [[REG]] -define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -106,7 +144,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* % ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) store half %canonicalized, half addrspace(1)* %out ret void @@ -115,7 +153,7 @@ define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) store half %canonicalized, half addrspace(1)* %out ret void @@ -124,7 +162,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) store half %canonicalized, half addrspace(1)* %out ret void @@ -133,7 +171,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) store half %canonicalized, half addrspace(1)* %out ret void @@ -142,7 +180,7 @@ define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) store half %canonicalized, half addrspace(1)* %out ret void @@ -151,7 +189,7 @@ define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 { %canonicalized 
= call half @llvm.canonicalize.f16(half 0xHFDFF) store half %canonicalized, half addrspace(1)* %out ret void @@ -160,13 +198,244 @@ define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) store half %canonicalized, half addrspace(1)* %out ret void } +; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16: +; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}} +; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}} +; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI-NOT: v_and_b32 + +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} +; GFX9: buffer_store_dword [[REG]] +define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { + %val = load <2 x half>, <2 x half> addrspace(1)* %out + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; FIXME: Fold modifier +; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16: +; VI-DAG: v_bfe_u32 +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI-NOT: 0xffff +; VI: v_or_b32 + +; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}} +; GCN: buffer_store_dword +define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { + %val = load <2 x half>, <2 x half> addrspace(1)* %out + %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16: +; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI: v_or_b32 + +; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}} +; GCN: buffer_store_dword +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { + %val = load <2 x half>, <2 x half> addrspace(1)* %out + %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %val.fabs.fneg = fsub <2 x half> , %val.fabs + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; FIXME: Fold modifier +; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16: +; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} +; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]] +; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]] +; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]] +; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI-NOT: 0xffff + +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} +; GFX9: buffer_store_dword [[REG]] +define amdgpu_kernel void 
@v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { + %val = load <2 x half>, <2 x half> addrspace(1)* %out + %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16: +; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}} +; VI-DAG: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}} +; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI-NOT: v_and_b32 + +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}} +; GFX9: buffer_store_dword [[REG]] +define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { + %val = bitcast i32 %val.arg to <2 x half> + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} +; GCN: buffer_store_dword [[REG]] +define
amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>) + store <2 x half>
%canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "target-features"="-flat-for-global" } -attributes #2 = { nounwind "target-features"="-flat-for-global,-fp16-denormals,-fp16-denormals" } -attributes #3 = { nounwind "target-features"="-flat-for-global,+fp16-denormals,+fp64-denormals" } +attributes #1 = { nounwind } +attributes #2 = { nounwind "target-features"="-fp64-fp16-denormals" } +attributes #3 = { nounwind "target-features"="+fp64-fp16-denormals" } diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll index 981d88dfe94e..8c385f40b1c5 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -1,12 +1,14 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +declare float @llvm.fabs.f32(float) #0 declare float @llvm.canonicalize.f32(float) #0 +declare double @llvm.fabs.f64(double) #0 declare double @llvm.canonicalize.f64(double) #0 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32: ; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, float addrspace(1)* %out @@ -16,16 +18,50 @@ define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} ; GCN: buffer_store_dword [[REG]] -define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 { %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, float addrspace(1)* %out ret void } +; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32: +; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 { + %val = load float, float addrspace(1)* %out + %val.fabs = call float @llvm.fabs.f32(float %val) + %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs) + store float %canonicalized, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32: +; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}| +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { + %val = load float, float addrspace(1)* %out + %val.fabs = call float @llvm.fabs.f32(float %val) + %val.fabs.fneg = fsub float -0.0, %val.fabs + %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs.fneg) + store float %canonicalized, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32: +; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, 
-{{v[0-9]+}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { + %val = load float, float addrspace(1)* %out + %val.fneg = fsub float -0.0, %val + %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg) + store float %canonicalized, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -34,7 +70,7 @@ define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -0.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -43,7 +79,7 @@ define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 1.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -52,7 +88,7 @@ define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -1.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -61,7 +97,7 @@ define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 16.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -70,7 +106,7 @@ define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -79,7 +115,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1 ; 
GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -88,7 +124,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -97,7 +133,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -106,7 +142,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000) store float %canonicalized, float addrspace(1)* %out ret void @@ -115,7 +151,7 @@ define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -124,7 +160,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -133,7 +169,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out ; GCN-LABEL: 
{{^}}test_fold_canonicalize_snan0_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -142,7 +178,7 @@ define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -151,7 +187,7 @@ define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -160,7 +196,7 @@ define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -169,7 +205,7 @@ define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, double addrspace(1)* %out @@ -179,17 +215,51 @@ define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, double addrspace(1)* %out ret void } +; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64: +; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}| +; GCN: buffer_store_dwordx2 [[REG]] +define 
amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { + %val = load double, double addrspace(1)* %out + %val.fabs = call double @llvm.fabs.f64(double %val) + %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs) + store double %canonicalized, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64: +; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}| +; GCN: buffer_store_dwordx2 [[REG]] +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { + %val = load double, double addrspace(1)* %out + %val.fabs = call double @llvm.fabs.f64(double %val) + %val.fabs.fneg = fsub double -0.0, %val.fabs + %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs.fneg) + store double %canonicalized, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64: +; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}} +; GCN: buffer_store_dwordx2 [[REG]] +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { + %val = load double, double addrspace(1)* %out + %val.fneg = fsub double -0.0, %val + %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg) + store double %canonicalized, double addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -199,7 +269,7 @@ define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -0.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -209,7 +279,7 @@ define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 1.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -219,7 +289,7 @@ define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -1.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -229,7 +299,7 @@ define void 
@test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 16.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -239,7 +309,7 @@ define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -249,7 +319,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace( ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -259,7 +329,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -269,7 +339,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace( ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -279,7 +349,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) store double 
%canonicalized, double addrspace(1)* %out ret void @@ -289,7 +359,7 @@ define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -299,7 +369,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %ou ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -309,7 +379,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %ou ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -319,7 +389,7 @@ define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) # ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -329,7 +399,7 @@ define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) # ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -339,7 +409,7 @@ define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) # ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { %canonicalized 
= call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -347,5 +417,5 @@ define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) # attributes #0 = { nounwind readnone } attributes #1 = { nounwind } -attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" } -attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" } +attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } +attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" } diff --git a/test/CodeGen/AMDGPU/fceil.ll b/test/CodeGen/AMDGPU/fceil.ll index efdda78f852b..0b913fda8580 100644 --- a/test/CodeGen/AMDGPU/fceil.ll +++ b/test/CodeGen/AMDGPU/fceil.ll @@ -13,7 +13,7 @@ declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone ; SI: v_ceil_f32_e32 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) { %y = call float @llvm.ceil.f32(float %x) nounwind readnone store float %y, float addrspace(1)* %out ret void @@ -25,7 +25,7 @@ define void @fceil_f32(float addrspace(1)* %out, float %x) { ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone store <2 x float> %y, <2 x float> addrspace(1)* %out ret void @@ -41,7 +41,7 @@ define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { ; EG-DAG: CEIL {{\*? *}}[[RESULT1]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone store <3 x float> %y, <3 x float> addrspace(1)* %out ret void @@ -57,7 +57,7 @@ define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone store <4 x float> %y, <4 x float> addrspace(1)* %out ret void @@ -82,7 +82,7 @@ define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { +define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone store <8 x float> %y, <8 x float> addrspace(1)* %out ret void @@ -125,7 +125,7 @@ define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { ; EG-DAG: CEIL {{\*? *}}[[RESULT4]] ; EG-DAG: CEIL {{\*? *}}[[RESULT4]] ; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] -define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { +define amdgpu_kernel void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone store <16 x float> %y, <16 x float> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll index 98448db5dd24..61572a855620 100644 --- a/test/CodeGen/AMDGPU/fceil64.ll +++ b/test/CodeGen/AMDGPU/fceil64.ll @@ -31,7 +31,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone ; SI: v_cndmask_b32 ; SI: v_add_f64 ; SI: s_endpgm -define void @fceil_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.ceil.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -40,7 +40,7 @@ define void @fceil_f64(double addrspace(1)* %out, double %x) { ; FUNC-LABEL: {{^}}fceil_v2f64: ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -50,7 +50,7 @@ define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; FIXME-CI: v_ceil_f64_e32 ; FIXME-CI: v_ceil_f64_e32 ; FIXME-CI: v_ceil_f64_e32 -; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; define amdgpu_kernel void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { ; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone ; store <3 x double> %y, <3 x double> addrspace(1)* %out ; ret void @@ -61,7 +61,7 @@ define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone store <4 x double> %y, <4 x double> addrspace(1)* %out ret void @@ -76,7 +76,7 @@ define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { +define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -99,7 +99,7 @@ define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/fcmp-cnd.ll b/test/CodeGen/AMDGPU/fcmp-cnd.ll index 530274f920f0..7f8be804309e 100644 --- a/test/CodeGen/AMDGPU/fcmp-cnd.ll +++ b/test/CodeGen/AMDGPU/fcmp-cnd.ll @@ -4,7 +4,7 @@ ;registers and literal.x depending on what the optimizer 
does. ;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %cmp = fcmp oeq float %0, 0.000000e+00 diff --git a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll index c402805feb39..2a848e80b81b 100644 --- a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll +++ b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll @@ -6,7 +6,7 @@ ; CHECK: SET{{[A-Z]+}}_DX10 -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %cmp = fcmp oeq float %0, 0.000000e+00 diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll index a62726f7f068..7916226462f7 100644 --- a/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lt( +define amdgpu_kernel void @fcmp_f16_lt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -28,16 +28,16 @@ entry: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]| +; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]| -; SI: v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F32]]|, |v[[B_F32]]| +; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] ; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]| ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lt_abs( +define amdgpu_kernel void @fcmp_f16_lt_abs( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -62,7 +62,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_eq( +define amdgpu_kernel void @fcmp_f16_eq( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -85,7 +85,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_le( +define amdgpu_kernel void @fcmp_f16_le( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -108,7 +108,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_gt( +define amdgpu_kernel void @fcmp_f16_gt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -131,7 +131,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lg( +define amdgpu_kernel void @fcmp_f16_lg( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -154,7 +154,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_ge( +define amdgpu_kernel void @fcmp_f16_ge( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -177,7 +177,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_o( +define amdgpu_kernel void @fcmp_f16_o( i32 addrspace(1)* 
%r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -200,7 +200,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_u( +define amdgpu_kernel void @fcmp_f16_u( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -223,7 +223,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nge( +define amdgpu_kernel void @fcmp_f16_nge( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -246,7 +246,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nlg( +define amdgpu_kernel void @fcmp_f16_nlg( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -269,7 +269,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_ngt( +define amdgpu_kernel void @fcmp_f16_ngt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -292,7 +292,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nle( +define amdgpu_kernel void @fcmp_f16_nle( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -315,7 +315,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_neq( +define amdgpu_kernel void @fcmp_f16_neq( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -338,7 +338,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nlt( +define amdgpu_kernel void @fcmp_f16_nlt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -368,7 +368,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_lt( +define amdgpu_kernel void @fcmp_v2f16_lt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -398,7 +398,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_eq( +define amdgpu_kernel void @fcmp_v2f16_eq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -428,7 +428,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_le( +define amdgpu_kernel void @fcmp_v2f16_le( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -458,7 +458,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_gt( +define amdgpu_kernel void @fcmp_v2f16_gt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -488,7 +488,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_lg( +define amdgpu_kernel void @fcmp_v2f16_lg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -518,7 +518,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 
v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_ge( +define amdgpu_kernel void @fcmp_v2f16_ge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -548,7 +548,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_o( +define amdgpu_kernel void @fcmp_v2f16_o( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -578,7 +578,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_u( +define amdgpu_kernel void @fcmp_v2f16_u( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -608,7 +608,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nge( +define amdgpu_kernel void @fcmp_v2f16_nge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -638,7 +638,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nlg( +define amdgpu_kernel void @fcmp_v2f16_nlg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -668,7 +668,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_ngt( +define amdgpu_kernel void @fcmp_v2f16_ngt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -698,7 +698,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nle( +define amdgpu_kernel void @fcmp_v2f16_nle( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -728,7 +728,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_neq( +define amdgpu_kernel void @fcmp_v2f16_neq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -758,7 +758,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nlt( +define amdgpu_kernel void @fcmp_v2f16_nlt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { diff --git a/test/CodeGen/AMDGPU/fcmp.ll b/test/CodeGen/AMDGPU/fcmp.ll index 97d954fcc3c2..b548670edb06 100644 --- a/test/CodeGen/AMDGPU/fcmp.ll +++ b/test/CodeGen/AMDGPU/fcmp.ll @@ -3,7 +3,7 @@ ; CHECK: {{^}}fcmp_sext: ; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 @@ -22,7 +22,7 @@ entry: ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} ; CHECK-NEXT: {{[0-9]+\(5.0}} -define void @fcmp_br(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_br(i32 addrspace(1)* %out, float %in) { entry: %0 = 
fcmp oeq float %in, 5.0 br i1 %0, label %IF, label %ENDIF diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll index acce82fdfe53..b9e1921d4c45 100644 --- a/test/CodeGen/AMDGPU/fcmp64.ll +++ b/test/CodeGen/AMDGPU/fcmp64.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}flt_f64: ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -15,7 +15,7 @@ define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fle_f64: ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -27,7 +27,7 @@ define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fgt_f64: ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -39,7 +39,7 @@ define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fge_f64: ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -51,7 +51,7 @@ define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fne_f64: ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -63,7 +63,7 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}feq_f64: ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll index 89af37545c99..125597796245 100644 --- a/test/CodeGen/AMDGPU/fconst64.ll +++ b/test/CodeGen/AMDGPU/fconst64.ll @@ -5,7 +5,7 @@ ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 -define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %r1 = load double, double addrspace(1)* %in %r2 = fadd double %r1, 5.000000e+00 
store double %r2, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll new file mode 100644 index 000000000000..4e2bf765cd95 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -0,0 +1,264 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s + +declare half @llvm.copysign.f16(half, half) +declare float @llvm.copysign.f32(float, float) +declare double @llvm.copysign.f64(double, double) +declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>) +declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) +declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) + +; GCN-LABEL: {{^}}test_copysign_f16: +; SI: buffer_load_ushort v[[SIGN:[0-9]+]] +; SI: buffer_load_ushort v[[MAG:[0-9]+]] +; SI: s_brev_b32 s[[CONST:[0-9]+]], -2 +; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] +; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] +; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]] +; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]] +; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]] +; GFX89: buffer_load_ushort v[[MAG:[0-9]+]] +; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff +; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]] +; GCN: buffer_store_short v[[OUT]] +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_f16( + half addrspace(1)* %arg_out, + half addrspace(1)* %arg_mag, + half addrspace(1)* %arg_sign) { +entry: + %mag = load half, half addrspace(1)* %arg_mag + %sign = load half, half addrspace(1)* %arg_sign + %out = call half @llvm.copysign.f16(half %mag, half %sign) + store half %out, half addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32: +; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]] +; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 +; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]] +; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]] +; GCN: buffer_store_dword v[[OUT]] +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32( + float addrspace(1)* %arg_out, + half addrspace(1)* %arg_mag, + float addrspace(1)* %arg_sign) { +entry: + %mag = load half, half addrspace(1)* %arg_mag + %mag.ext = fpext half %mag to float + %sign = load float, float addrspace(1)* %arg_sign + %out = call float @llvm.copysign.f32(float %mag.ext, float %sign) + store float %out, float addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64: +; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 +; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]] +; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}} +; GCN: s_endpgm +define amdgpu_kernel void 
@test_copysign_out_f64_mag_f16_sign_f64( + double addrspace(1)* %arg_out, + half addrspace(1)* %arg_mag, + double addrspace(1)* %arg_sign) { +entry: + %mag = load half, half addrspace(1)* %arg_mag + %mag.ext = fpext half %mag to double + %sign = load double, double addrspace(1)* %arg_sign + %out = call double @llvm.copysign.f64(double %mag.ext, double %sign) + store double %out, double addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16: +; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]] +; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 +; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] +; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]] +; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]] +; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] +; GCN: buffer_store_dword v[[OUT]] +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16( + float addrspace(1)* %arg_out, + float addrspace(1)* %arg_mag, + half addrspace(1)* %arg_sign) { +entry: + %mag = load float, float addrspace(1)* %arg_mag + %sign = load half, half addrspace(1)* %arg_sign + %sign.ext = fpext half %sign to float + %out = call float @llvm.copysign.f32(float %mag, float %sign.ext) + store float %out, float addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16: +; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}} +; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 +; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] +; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]] +; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]] +; GFX89: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]] +; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}} +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16( + double addrspace(1)* %arg_out, + double addrspace(1)* %arg_mag, + half addrspace(1)* %arg_sign) { +entry: + %mag = load double, double addrspace(1)* %arg_mag + %sign = load half, half addrspace(1)* %arg_sign + %sign.ext = fpext half %sign to double + %out = call double @llvm.copysign.f64(double %mag, double %sign.ext) + store double %out, double addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32: +; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]] +; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 +; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] +; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]] +; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]] +; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff +; GFX89-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]] +; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] +; GCN: buffer_store_short v[[OUT]] +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32( + half addrspace(1)* %arg_out, + half addrspace(1)* %arg_mag, + float addrspace(1)* %arg_sign) { +entry: + %mag = load half, half addrspace(1)* %arg_mag + %sign = load float, float addrspace(1)* %arg_sign + %sign.trunc = fptrunc float %sign to half + %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) + store half %out, half addrspace(1)* %arg_out + ret void +} + +; 
GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64: +; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} +; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 +; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] +; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]] +; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]] +; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff +; GFX89-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN_HI]] +; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] +; GCN: buffer_store_short v[[OUT]] +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64( + half addrspace(1)* %arg_out, + half addrspace(1)* %arg_mag, + double addrspace(1)* %arg_sign) { +entry: + %mag = load half, half addrspace(1)* %arg_mag + %sign = load double, double addrspace(1)* %arg_sign + %sign.trunc = fptrunc double %sign to half + %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) + store half %out, half addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16: +; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]] +; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 +; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]] +; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] +; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG_TRUNC]] +; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]] +; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]] +; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff +; GFX89-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]] +; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]] +; GCN: buffer_store_short v[[OUT]] +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16( + half addrspace(1)* %arg_out, + float addrspace(1)* %arg_mag, + half addrspace(1)* %arg_sign) { +entry: + %mag = load float, float addrspace(1)* %arg_mag + %mag.trunc = fptrunc float %mag to half + %sign = load half, half addrspace(1)* %arg_sign + %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign) + store half %out, half addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16: +; GCN: v_bfi_b32 +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16( + half addrspace(1)* %arg_out, + double addrspace(1)* %arg_mag, + half addrspace(1)* %arg_sign) { +entry: + %mag = load double, double addrspace(1)* %arg_mag + %mag.trunc = fptrunc double %mag to half + %sign = load half, half addrspace(1)* %arg_sign + %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign) + store half %out, half addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_v2f16: +; GCN: v_bfi_b32 +; GCN: v_bfi_b32 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_v2f16( + <2 x half> addrspace(1)* %arg_out, + <2 x half> %arg_mag, + <2 x half> %arg_sign) { +entry: + %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign) + store <2 x half> %out, <2 x half> addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_v3f16: +; GCN: v_bfi_b32 +; GCN: v_bfi_b32 +; GCN: v_bfi_b32 +; GCN: s_endpgm +define amdgpu_kernel void 
@test_copysign_v3f16( + <3 x half> addrspace(1)* %arg_out, + <3 x half> %arg_mag, + <3 x half> %arg_sign) { +entry: + %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign) + store <3 x half> %out, <3 x half> addrspace(1)* %arg_out + ret void +} + +; GCN-LABEL: {{^}}test_copysign_v4f16: +; GCN: v_bfi_b32 +; GCN: v_bfi_b32 +; GCN: v_bfi_b32 +; GCN: v_bfi_b32 +; GCN: s_endpgm +define amdgpu_kernel void @test_copysign_v4f16( + <4 x half> addrspace(1)* %arg_out, + <4 x half> %arg_mag, + <4 x half> %arg_sign) { +entry: + %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) + store <4 x half> %out, <4 x half> addrspace(1)* %arg_out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcopysign.f32.ll b/test/CodeGen/AMDGPU/fcopysign.f32.ll index 632de18dafcb..e5893e5995a3 100644 --- a/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -20,7 +20,7 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read ; GCN: s_endpgm ; EG: BFI_INT -define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { +define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { %result = call float @llvm.copysign.f32(float %mag, float %sign) store float %result, float addrspace(1)* %out, align 4 ret void @@ -31,7 +31,7 @@ define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign ; EG: BFI_INT ; EG: BFI_INT -define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 ret void @@ -44,7 +44,7 @@ define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %ma ; EG: BFI_INT ; EG: BFI_INT ; EG: BFI_INT -define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 ret void diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll index 12c942beee6c..67779a8ff3b9 100644 --- a/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -17,7 +17,7 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} ; GCN: s_endpgm -define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { %result = call double @llvm.copysign.f64(double %mag, double %sign) store double %result, double addrspace(1)* %out, align 8 ret void @@ -32,7 +32,7 @@ define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %s ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 
v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} -define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind { %c = fpext float %sign to double %result = call double @llvm.copysign.f64(double %mag, double %c) store double %result, double addrspace(1)* %out, align 8 @@ -41,7 +41,7 @@ define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float ; FUNC-LABEL: {{^}}test_copysign_v2f64: ; GCN: s_endpgm -define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 ret void @@ -49,7 +49,7 @@ define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> % ; FUNC-LABEL: {{^}}test_copysign_v4f64: ; GCN: s_endpgm -define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 ret void diff --git a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll index 70b70bdaaaa7..7f84e973c958 100644 --- a/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -31,7 +31,7 @@ ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_fdiv_f16( +define amdgpu_kernel void @v_fdiv_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { @@ -54,7 +54,7 @@ entry: ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [[RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -72,7 +72,7 @@ entry: ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]| ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -91,7 +91,7 @@ entry: ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [[RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -109,7 +109,7 @@ entry: ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]] ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define 
amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -127,7 +127,7 @@ entry: ; VI: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -147,7 +147,7 @@ entry: ; VI-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]] ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -168,7 +168,7 @@ entry: ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -190,7 +190,7 @@ entry: ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { +define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -204,6 +204,42 @@ entry: ret void } +; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16: +; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} + +; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} +; VI: buffer_store_short [[MUL]] +define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 { + %x = load half, half addrspace(1)* undef + %rcp = fdiv arcp half %x, 2.0 + store half %rcp, half addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16: +; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}} + +; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}} +; VI: buffer_store_short [[MUL]] +define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 { + %x = load half, half addrspace(1)* undef + %rcp = fdiv arcp half %x, 10.0 + store half %rcp, half addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16: +; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}} + +; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}} +; VI: buffer_store_short [[MUL]] +define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { + %x = load half, half addrspace(1)* undef + %rcp = fdiv arcp half %x, -10.0 + store half %rcp, half addrspace(1)* %out, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.sqrt.f16(half) #1 declare half @llvm.fabs.f16(half) #1 diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll index 20f9e4df07fd..d16bdf43ee26 100644 --- a/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii 
-verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s -; COMMON-LABEL: {{^}}fdiv_f64: -; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 -; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-LABEL: {{^}}fdiv_f64: +; GCN-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 +; GCN-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] ; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] @@ -13,23 +13,23 @@ ; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] ; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] -; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] +; GCN-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] ; SI-DAG: v_cmp_eq_u32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}} ; SI-DAG: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc -; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0 -; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] -; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 -; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] -; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] -; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] -; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]] -; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] -; COMMON: buffer_store_dwordx2 [[RESULT]] -; COMMON: s_endpgm -define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind { +; GCN-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0 +; GCN-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] +; GCN-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 +; GCN-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] +; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] +; GCN-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] +; GCN: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]] +; GCN: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] +; GCN: buffer_store_dwordx2 [[RESULT]] +; GCN: s_endpgm +define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %gep.1 = 
getelementptr double, double addrspace(1)* %in, i32 1 %num = load volatile double, double addrspace(1)* %in %den = load volatile double, double addrspace(1)* %gep.1 @@ -38,31 +38,31 @@ define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounw ret void } -; COMMON-LABEL: {{^}}fdiv_f64_s_v: -define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind { +; GCN-LABEL: {{^}}fdiv_f64_s_v: +define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 { %den = load double, double addrspace(1)* %in %result = fdiv double %num, %den store double %result, double addrspace(1)* %out ret void } -; COMMON-LABEL: {{^}}fdiv_f64_v_s: -define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind { +; GCN-LABEL: {{^}}fdiv_f64_v_s: +define amdgpu_kernel void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 { %num = load double, double addrspace(1)* %in %result = fdiv double %num, %den store double %result, double addrspace(1)* %out ret void } -; COMMON-LABEL: {{^}}fdiv_f64_s_s: -define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind { +; GCN-LABEL: {{^}}fdiv_f64_s_s: +define amdgpu_kernel void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 { %result = fdiv double %num, %den store double %result, double addrspace(1)* %out ret void } -; COMMON-LABEL: {{^}}v_fdiv_v2f64: -define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind { +; GCN-LABEL: {{^}}v_fdiv_v2f64: +define amdgpu_kernel void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 %num = load <2 x double>, <2 x double> addrspace(1)* %in %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 @@ -71,15 +71,15 @@ define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac ret void } -; COMMON-LABEL: {{^}}s_fdiv_v2f64: -define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { +; GCN-LABEL: {{^}}s_fdiv_v2f64: +define amdgpu_kernel void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { %result = fdiv <2 x double> %num, %den store <2 x double> %result, <2 x double> addrspace(1)* %out ret void } -; COMMON-LABEL: {{^}}v_fdiv_v4f64: -define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind { +; GCN-LABEL: {{^}}v_fdiv_v4f64: +define amdgpu_kernel void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 %num = load <4 x double>, <4 x double> addrspace(1)* %in %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 @@ -88,9 +88,46 @@ define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspac ret void } -; COMMON-LABEL: {{^}}s_fdiv_v4f64: -define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) { +; GCN-LABEL: {{^}}s_fdiv_v4f64: +define amdgpu_kernel void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 { %result = fdiv <4 x double> %num, %den store <4 x double> %result, <4 x double> addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}div_fast_2_x_pat_f64: +; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5 +; 
GCN: buffer_store_dwordx2 [[MUL]] +define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 { + %x = load double, double addrspace(1)* undef + %rcp = fdiv fast double %x, 2.0 + store double %rcp, double addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}div_fast_k_x_pat_f64: +; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a +; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999 +; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; GCN: buffer_store_dwordx2 [[MUL]] +define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { + %x = load double, double addrspace(1)* undef + %rcp = fdiv fast double %x, 10.0 + store double %rcp, double addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}div_fast_neg_k_x_pat_f64: +; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a +; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999 +; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; GCN: buffer_store_dwordx2 [[MUL]] +define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 { + %x = load double, double addrspace(1)* undef + %rcp = fdiv fast double %x, -10.0 + store double %rcp, double addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll index 0e95de9c555c..b3a2b6643720 100644 --- a/test/CodeGen/AMDGPU/fdiv.ll +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -27,7 +27,7 @@ ; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv float %a, %b store float %fdiv, float addrspace(1)* %out @@ -52,7 +52,7 @@ entry: ; GCN-NOT: s_setreg ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], -define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv float %a, %b store float %fdiv, float addrspace(1)* %out @@ -65,7 +65,7 @@ entry: ; GCN: v_rcp_f32 ; GCN: v_mul_f32 ; GCN: v_mul_f32 -define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv float %a, %b, !fpmath !0 store float %fdiv, float addrspace(1)* %out @@ -77,7 +77,7 @@ entry: ; GCN: v_fma_f32 ; GCN: v_div_fmas_f32 ; GCN: v_div_fixup_f32 -define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv float %a, %b, !fpmath !0 store float %fdiv, float addrspace(1)* %out @@ -89,7 +89,7 @@ entry: ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv fast float %a, %b store float %fdiv, float addrspace(1)* %out @@ -104,7 +104,7 @@ entry: 
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv fast float %a, %b store float %fdiv, float addrspace(1)* %out @@ -119,7 +119,7 @@ entry: ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv arcp float %a, %b store float %fdiv, float addrspace(1)* %out @@ -136,7 +136,7 @@ entry: ; GCN: v_div_scale_f32 ; GCN: v_div_scale_f32 ; GCN: v_div_scale_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -146,7 +146,7 @@ entry: ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32: ; GCN: v_cmp_gt_f32 ; GCN: v_cmp_gt_f32 -define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -161,7 +161,7 @@ entry: ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv fast <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -176,7 +176,7 @@ entry: ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv arcp <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -197,7 +197,7 @@ entry: ; GCN: v_div_fixup_f32 ; GCN: v_div_fixup_f32 ; GCN: v_div_fixup_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -220,7 +220,7 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -243,7 +243,7 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad ; GCN: v_rcp_f32 ; GCN: 
v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll index 83ffbdfa23a5..407cccb8443e 100644 --- a/test/CodeGen/AMDGPU/ffloor.f64.ll +++ b/test/CodeGen/AMDGPU/ffloor.f64.ll @@ -19,7 +19,7 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 ; SI: s_endpgm -define void @ffloor_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.floor.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -34,7 +34,7 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) { ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] ; SI: s_endpgm -define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { %neg = fsub double 0.0, %x %y = call double @llvm.floor.f64(double %neg) nounwind readnone store double %y, double addrspace(1)* %out @@ -50,7 +50,7 @@ define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| ; SI: s_endpgm -define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { %abs = call double @llvm.fabs.f64(double %x) %neg = fsub double 0.0, %abs %y = call double @llvm.floor.f64(double %neg) nounwind readnone @@ -61,7 +61,7 @@ define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { ; FUNC-LABEL: {{^}}ffloor_v2f64: ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -72,7 +72,7 @@ define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI-NOT: v_floor_f64_e32 -define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +define amdgpu_kernel void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone store <3 x double> %y, <3 x double> addrspace(1)* %out ret void @@ -83,7 +83,7 @@ define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone store <4 x double> %y, <4 x double> addrspace(1)* %out ret void @@ -98,7 +98,7 @@ define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: 
v_floor_f64_e32 -define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { +define amdgpu_kernel void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -121,7 +121,7 @@ define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/ffloor.ll b/test/CodeGen/AMDGPU/ffloor.ll index d7f35a45075c..720fe7a45e3d 100644 --- a/test/CodeGen/AMDGPU/ffloor.ll +++ b/test/CodeGen/AMDGPU/ffloor.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}floor_f32: ; SI: v_floor_f32_e32 ; R600: FLOOR -define void @floor_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @floor_f32(float addrspace(1)* %out, float %in) { %tmp = call float @llvm.floor.f32(float %in) #0 store float %tmp, float addrspace(1)* %out ret void @@ -15,7 +15,7 @@ define void @floor_f32(float addrspace(1)* %out, float %in) { ; SI: v_floor_f32_e32 ; SI: v_floor_f32_e32 -define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 store <2 x float> %tmp, <2 x float> addrspace(1)* %out ret void @@ -31,7 +31,7 @@ define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { ; R600: FLOOR ; R600: FLOOR ; R600: FLOOR -define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 store <4 x float> %tmp, <4 x float> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/fix-vgpr-copies.mir b/test/CodeGen/AMDGPU/fix-vgpr-copies.mir new file mode 100644 index 000000000000..4951e0df4d3e --- /dev/null +++ b/test/CodeGen/AMDGPU/fix-vgpr-copies.mir @@ -0,0 +1,44 @@ +# RUN: llc -march=amdgcn -start-after=greedy -stop-after=si-optimize-exec-masking -o - %s | FileCheck %s +# Check that we first do all vector instructions and only then change exec +# CHECK-DAG: COPY %vgpr10_vgpr11 +# CHECK-DAG: COPY %vgpr12_vgpr13 +# CHECK: %exec = COPY + +--- +name: main +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%sgpr4_sgpr5' } + - { reg: '%sgpr6' } + - { reg: '%vgpr0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.entry: + liveins: %vgpr3, %vgpr10_vgpr11, %vgpr12_vgpr13 + + %vcc = V_CMP_NE_U32_e64 0, killed %vgpr3, implicit %exec + %sgpr4_sgpr5 = COPY %exec, implicit-def %exec + %sgpr6_sgpr7 = S_AND_B64 %sgpr4_sgpr5, killed %vcc, implicit-def dead %scc + %sgpr4_sgpr5 = S_XOR_B64 %sgpr6_sgpr7, killed %sgpr4_sgpr5, implicit-def dead 
%scc + %vgpr61_vgpr62 = COPY %vgpr10_vgpr11 + %vgpr155_vgpr156 = COPY %vgpr12_vgpr13 + %exec = S_MOV_B64_term killed %sgpr6_sgpr7 +... diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll index 55b5482d031f..c867e4fca229 100644 --- a/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/test/CodeGen/AMDGPU/flat-address-space.ll @@ -17,43 +17,43 @@ ; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] ; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]] -define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr, align 4 + store volatile i32 %x, i32 addrspace(4)* %fptr, align 4 ret void } ; CHECK-LABEL: {{^}}store_flat_i64: ; CHECK: flat_store_dwordx2 -define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { +define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* - store i64 %x, i64 addrspace(4)* %fptr, align 8 + store volatile i64 %x, i64 addrspace(4)* %fptr, align 8 ret void } ; CHECK-LABEL: {{^}}store_flat_v4i32: ; CHECK: flat_store_dwordx4 -define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* - store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 + store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 ret void } ; CHECK-LABEL: {{^}}store_flat_trunc_i16: ; CHECK: flat_store_short -define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* %y = trunc i32 %x to i16 - store i16 %y, i16 addrspace(4)* %fptr, align 2 + store volatile i16 %y, i16 addrspace(4)* %fptr, align 2 ret void } ; CHECK-LABEL: {{^}}store_flat_trunc_i8: ; CHECK: flat_store_byte -define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* %y = trunc i32 %x to i8 - store i8 %y, i8 addrspace(4)* %fptr, align 2 + store volatile i8 %y, i8 addrspace(4)* %fptr, align 2 ret void } @@ -61,36 +61,36 @@ define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { ; CHECK-LABEL: load_flat_i32: ; CHECK: flat_load_dword -define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - %fload = load i32, i32 addrspace(4)* %fptr, align 4 + %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4 store i32 %fload, i32 addrspace(1)* %out, align 4 ret void } ; CHECK-LABEL: load_flat_i64: ; CHECK: flat_load_dwordx2 -define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i64 addrspace(1)* 
%gptr to i64 addrspace(4)* - %fload = load i64, i64 addrspace(4)* %fptr, align 8 + %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8 store i64 %fload, i64 addrspace(1)* %out, align 8 ret void } ; CHECK-LABEL: load_flat_v4i32: ; CHECK: flat_load_dwordx4 -define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* - %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32 + %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32 store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8 ret void } ; CHECK-LABEL: sextload_flat_i8: ; CHECK: flat_load_sbyte -define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %fload = load i8, i8 addrspace(4)* %fptr, align 4 + %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4 %ext = sext i8 %fload to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -98,9 +98,9 @@ define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n ; CHECK-LABEL: zextload_flat_i8: ; CHECK: flat_load_ubyte -define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %fload = load i8, i8 addrspace(4)* %fptr, align 4 + %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4 %ext = zext i8 %fload to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -108,9 +108,9 @@ define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n ; CHECK-LABEL: sextload_flat_i16: ; CHECK: flat_load_sshort -define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %fload = load i16, i16 addrspace(4)* %fptr, align 4 + %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4 %ext = sext i16 %fload to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -118,9 +118,9 @@ define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ; CHECK-LABEL: zextload_flat_i16: ; CHECK: flat_load_ushort -define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %fload = load i16, i16 addrspace(4)* %fptr, align 4 + %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4 %ext = zext i16 %fload to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -131,7 +131,7 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ; CHECK: flat_load_ubyte ; CHECK: flat_load_ubyte ; CHECK: flat_load_ubyte -define void @flat_scratch_unaligned_load() { +define amdgpu_kernel void @flat_scratch_unaligned_load() { 
%scratch = alloca i32 %fptr = addrspacecast i32* %scratch to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1 @@ -143,7 +143,7 @@ define void @flat_scratch_unaligned_load() { ; CHECK: flat_store_byte ; CHECK: flat_store_byte ; CHECK: flat_store_byte -define void @flat_scratch_unaligned_store() { +define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32 %fptr = addrspacecast i32* %scratch to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %fptr, align 1 @@ -154,7 +154,7 @@ define void @flat_scratch_unaligned_store() { ; HSA: flat_load_dword ; HSA: flat_load_dword ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr -define void @flat_scratch_multidword_load() { +define amdgpu_kernel void @flat_scratch_multidword_load() { %scratch = alloca <2 x i32> %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)* %ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr @@ -165,7 +165,7 @@ define void @flat_scratch_multidword_load() { ; HSA: flat_store_dword ; HSA: flat_store_dword ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr -define void @flat_scratch_multidword_store() { +define amdgpu_kernel void @flat_scratch_multidword_store() { %scratch = alloca <2 x i32> %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)* store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr diff --git a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll index df9ba00c6974..dac1500cd46c 100644 --- a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll +++ b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll @@ -23,7 +23,7 @@ ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void @@ -36,7 +36,7 @@ entry: ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define void @test_addr64(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) { entry: %out.addr = alloca i32 addrspace(1)*, align 4 diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll index b71c8bcb76c7..23f40daf3d23 100644 --- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -19,7 +19,7 @@ ; CI: ; NumSgprs: 8 ; VI-NOXNACK: ; NumSgprs: 8 ; VI-XNACK: ; NumSgprs: 12 -define void @no_vcc_no_flat() { +define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{SGPR7}"() ret void @@ -33,7 +33,7 @@ entry: ; CI: ; NumSgprs: 10 ; VI-NOXNACK: ; NumSgprs: 10 ; VI-XNACK: ; NumSgprs: 12 -define void @vcc_no_flat() { +define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC}"() ret void @@ -50,7 +50,7 @@ entry: ; HSA-CI: ; NumSgprs: 8 ; HSA-VI-NOXNACK: ; NumSgprs: 8 ; HSA-VI-XNACK: ; NumSgprs: 12 -define void @no_vcc_flat() { +define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() ret void @@ -66,7 +66,7 @@ entry: ; HSA-CI: ; NumSgprs: 10 ; HSA-VI-NOXNACK: ; NumSgprs: 10 ; HSA-VI-XNACK: ; NumSgprs: 12 -define void @vcc_flat() { +define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() ret
void diff --git a/test/CodeGen/AMDGPU/flat_atomics.ll b/test/CodeGen/AMDGPU/flat_atomics.ll index 7400dbcf8909..cc95d80570e0 100644 --- a/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/test/CodeGen/AMDGPU/flat_atomics.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_offset: ; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst @@ -13,7 +13,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_ret_offset: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst @@ -23,7 +23,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -34,7 +34,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -45,7 +45,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -54,7 +54,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_ret: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -63,7 +63,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_addr64: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -73,7 +73,7 @@ 
entry: ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -83,7 +83,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_offset: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst @@ -93,7 +93,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_ret_offset: ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst @@ -103,7 +103,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -114,7 +114,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -125,7 +125,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -134,7 +134,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_ret: ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -143,7 +143,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_addr64: ; GCN: flat_atomic_and 
v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -153,7 +153,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64: ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -163,7 +163,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_offset: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst @@ -173,7 +173,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst @@ -183,7 +183,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -194,7 +194,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -205,7 +205,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -214,7 +214,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_ret: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret(i32 
addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -223,7 +223,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_addr64: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -233,7 +233,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -243,7 +243,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_offset: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst @@ -253,7 +253,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_ret_offset: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst @@ -263,7 +263,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -274,7 +274,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -285,7 +285,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void 
@atomic_max_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -294,7 +294,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_ret: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -303,7 +303,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_addr64: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -313,7 +313,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -323,7 +323,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_offset: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst @@ -333,7 +333,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst @@ -343,7 +343,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -354,7 +354,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* 
%out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -365,7 +365,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -374,7 +374,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_ret: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -383,7 +383,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_addr64: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -393,7 +393,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -403,7 +403,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_offset: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst @@ -413,7 +413,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_ret_offset: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst @@ -423,7 +423,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -434,7 +434,7 @@ entry: ; 
GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -445,7 +445,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -454,7 +454,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_ret: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -463,7 +463,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_addr64: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -473,7 +473,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -483,7 +483,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_offset: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst @@ -493,7 +493,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst @@ -503,7 +503,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; GCN: flat_atomic_umin 
v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -514,7 +514,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -525,16 +525,16 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst ret void } ; GCN-LABEL: {{^}}atomic_umin_i32_ret: -; GCN: flat_atomic_umin v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -543,7 +543,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_addr64: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -553,7 +553,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}} - define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { + define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -563,7 +563,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_offset: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst @@ -573,7 +573,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_ret_offset: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, 
[[RET]] -define void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst @@ -583,7 +583,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -594,7 +594,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -605,7 +605,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -614,7 +614,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_ret: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -623,7 +623,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_addr64: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -633,7 +633,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -643,7 +643,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_offset: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 
%in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst @@ -653,7 +653,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst @@ -663,7 +663,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -674,7 +674,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -685,7 +685,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32: ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -694,7 +694,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_ret: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -703,7 +703,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -713,7 +713,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { 
entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -725,7 +725,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -735,7 +735,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -746,7 +746,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -757,7 +757,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -769,7 +769,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32: ; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst ret void @@ -778,7 +778,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 @@ -788,7 +788,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void 
@atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -798,7 +798,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -809,7 +809,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_offset: ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst @@ -819,7 +819,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst @@ -829,7 +829,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -840,7 +840,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -851,7 +851,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32: ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -860,7 +860,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_ret: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -869,7 +869,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_addr64: ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -879,7 +879,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -890,7 +890,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i32_offset: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { entry: %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4 %val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4 @@ -901,7 +901,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i32: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { entry: %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4 store i32 %val, i32 addrspace(4)* %out @@ -911,7 +911,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -923,7 +923,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i32_addr64: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4 @@ -933,7 +933,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32_offset: ; GCN: flat_store_dword 
v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4 @@ -942,7 +942,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) { entry: store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4 ret void @@ -950,7 +950,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -960,7 +960,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32_addr64: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4 diff --git a/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 0bd6c2dd5b86..723dde9ab68f 100644 --- a/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_offset: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst @@ -13,7 +13,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst @@ -23,7 +23,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -34,7 +34,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, 
[[RET]] -define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -45,7 +45,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -54,7 +54,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -63,7 +63,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -73,7 +73,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -83,7 +83,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_offset: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst @@ -93,7 +93,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst @@ -103,7 +103,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void 
@atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -114,7 +114,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -125,7 +125,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -134,7 +134,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -143,7 +143,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -153,7 +153,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -163,7 +163,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_offset: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst @@ -173,7 +173,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 
v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst @@ -183,7 +183,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -194,7 +194,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -205,7 +205,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -214,7 +214,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -223,7 +223,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -233,7 +233,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -243,7 +243,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_offset: ; GCN: flat_atomic_smax_x2 
v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst @@ -253,7 +253,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst @@ -263,7 +263,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -274,7 +274,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -285,7 +285,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -294,7 +294,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -303,7 +303,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -313,7 +313,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], 
v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -323,7 +323,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_offset: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst @@ -333,7 +333,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst @@ -343,7 +343,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -354,7 +354,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -365,7 +365,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -374,7 +374,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -383,7 +383,7 @@ entry: ; GCN-LABEL: 
{{^}}atomic_umax_i64_addr64: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -393,7 +393,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -403,7 +403,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_offset: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst @@ -413,7 +413,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst @@ -423,7 +423,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -434,7 +434,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -445,7 +445,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -454,7 +454,7 @@ entry: ; GCN-LABEL: 
{{^}}atomic_min_i64_ret: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -463,7 +463,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -473,7 +473,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -483,7 +483,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_offset: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst @@ -493,7 +493,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst @@ -503,7 +503,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -514,7 +514,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* 
%out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -525,7 +525,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -534,7 +534,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_ret: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -543,7 +543,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_addr64: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -553,7 +553,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64: ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -563,7 +563,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_offset: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst @@ -573,7 +573,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst @@ -583,7 +583,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* 
%out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -594,7 +594,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -605,7 +605,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -614,7 +614,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -623,7 +623,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -633,7 +633,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -643,7 +643,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_offset: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst @@ -653,7 +653,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: 
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst @@ -663,7 +663,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -674,7 +674,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -685,7 +685,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -694,7 +694,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -703,7 +703,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -713,7 +713,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -723,7 +723,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_offset: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr 
i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst @@ -733,7 +733,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst @@ -743,7 +743,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -754,7 +754,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -765,7 +765,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -774,7 +774,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -783,7 +783,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -793,7 +793,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -804,7 +804,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64_offset: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { entry: %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4 %val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8 @@ -815,7 +815,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { entry: %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8 store i64 %val, i64 addrspace(4)* %out @@ -825,7 +825,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -837,7 +837,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64_addr64: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8 @@ -847,7 +847,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64_offset: ; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8 @@ -856,7 +856,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc -define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) { entry: store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8 ret void @@ -864,7 +864,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, 
i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -874,7 +874,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64_addr64: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8 @@ -883,7 +883,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -892,7 +892,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000 %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -902,7 +902,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -913,7 +913,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -924,7 +924,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -936,7 +936,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64(i64 addrspace(4)* 
%out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst ret void @@ -945,7 +945,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 @@ -955,7 +955,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -965,7 +965,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll index 50c5a5abf7fa..4113ba8dc1f0 100644 --- a/test/CodeGen/AMDGPU/fma-combine.ll +++ b/test/CodeGen/AMDGPU/fma-combine.ll @@ -18,7 +18,7 @@ declare float @llvm.fma.f32(float, float, float) #0 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -46,7 +46,7 @@ define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addr ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 
@llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -75,7 +75,7 @@ define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -99,7 +99,7 @@ define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addr ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -127,7 +127,7 @@ define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -156,7 +156,7 @@ define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, d ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -184,7 +184,7 @@ define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define 
void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -213,7 +213,7 @@ define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, d ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -242,7 +242,7 @@ define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -276,7 +276,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %o ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -317,7 +317,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %o ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -358,7 +358,7 @@ define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias % ; SI-UNSAFE: v_fma_f64 
[[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -390,7 +390,7 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias % ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load volatile float, float addrspace(1)* %in1 @@ -406,7 +406,7 @@ define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load volatile float, float addrspace(1)* %in1 @@ -422,7 +422,7 @@ define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -438,7 +438,7 @@ define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -454,7 +454,7 @@ define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -470,7 +470,7 @@ define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -486,7 +486,7 @@ define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, 
-[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -502,7 +502,7 @@ define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -518,7 +518,7 @@ define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -534,7 +534,7 @@ define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -550,7 +550,7 @@ define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -566,7 +566,7 @@ define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -588,7 +588,7 @@ define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, ; ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]] ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]] -define void @test_f32_interp(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { @@ -610,7 +610,7 @@ define void @test_f32_interp(float addrspace(1)* %out, ; ; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]] ; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]] -define void @test_f64_interp(double addrspace(1)* %out, +define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { diff 
--git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll index cf6d7d824992..4d3f3712621e 100644 --- a/test/CodeGen/AMDGPU/fma.f64.ll +++ b/test/CodeGen/AMDGPU/fma.f64.ll @@ -8,7 +8,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) n ; FUNC-LABEL: {{^}}fma_f64: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -21,7 +21,7 @@ define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; FUNC-LABEL: {{^}}fma_v2f64: ; SI: v_fma_f64 ; SI: v_fma_f64 -define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 @@ -36,7 +36,7 @@ define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1 ; SI: v_fma_f64 ; SI: v_fma_f64 ; SI: v_fma_f64 -define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll index d04a5946b98c..659cecb59ebf 100644 --- a/test/CodeGen/AMDGPU/fma.ll +++ b/test/CodeGen/AMDGPU/fma.ll @@ -12,7 +12,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, ; EG: FMA {{\*? *}}[[RES]] -define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { %r0 = load float, float addrspace(1)* %in1 %r1 = load float, float addrspace(1)* %in2 @@ -29,7 +29,7 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]] -define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1 %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2 @@ -50,7 +50,7 @@ define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* ; EG-DAG: FMA {{\*? *}}[[RES]].Y ; EG-DAG: FMA {{\*? *}}[[RES]].Z ; EG-DAG: FMA {{\*? 
*}}[[RES]].W -define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1 %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2 @@ -62,7 +62,7 @@ define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* ; FUNC-LABEL: @fma_commute_mul_inline_imm_f32 ; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}} -define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -77,7 +77,7 @@ define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, fl } ; FUNC-LABEL: @fma_commute_mul_s_f32 -define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { +define amdgpu_kernel void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll index 4d42a4630e22..8b9104b79e7f 100644 --- a/test/CodeGen/AMDGPU/fmax3.f64.ll +++ b/test/CodeGen/AMDGPU/fmax3.f64.ll @@ -11,7 +11,7 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone ; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { +define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1 %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2 %a = load volatile double, double addrspace(1)* %aptr, align 8 diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll index 7c01ca85f6b9..a96eb5db9e2a 100644 --- a/test/CodeGen/AMDGPU/fmax3.ll +++ b/test/CodeGen/AMDGPU/fmax3.ll @@ -10,7 +10,7 @@ declare float @llvm.maxnum.f32(float, float) nounwind readnone ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 @@ -28,7 +28,7 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float 
addrspace(1)* %apt ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll index da498caa6b54..083346e9d1cb 100644 --- a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; FUNC-LABEL: @test_fmax_legacy_uge_f64 -define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -19,7 +19,7 @@ define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmax_legacy_oge_f64 -define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -34,7 +34,7 @@ define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmax_legacy_ugt_f64 -define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -49,7 +49,7 @@ define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmax_legacy_ogt_f64 -define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll index 4a4c92a38a35..7643c3ea533c 100644 --- a/test/CodeGen/AMDGPU/fmax_legacy.ll +++ b/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float 
addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -33,7 +33,7 @@ define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -53,7 +53,7 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -73,7 +73,7 @@ define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -93,7 +93,7 @@ define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 @@ -114,7 +114,7 @@ define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x flo ; SI-NONAN: v_max_f32_e32 ; SI-NONAN: v_max_f32_e32 ; SI-NONAN: v_max_f32_e32 -define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 @@ -137,7 +137,7 @@ define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x flo ; SI-NOT: v_max_ ; EG: MAX -define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { %tid = call i32 
@llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/test/CodeGen/AMDGPU/fmaxnum.f64.ll index fec3a358a4fa..20af278bf98c 100644 --- a/test/CodeGen/AMDGPU/fmaxnum.f64.ll +++ b/test/CodeGen/AMDGPU/fmaxnum.f64.ll @@ -9,7 +9,7 @@ declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0 ; FUNC-LABEL: @test_fmax_f64 ; SI: v_max_f64 -define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { %val = call double @llvm.maxnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* %out, align 8 ret void @@ -18,7 +18,7 @@ define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) noun ; FUNC-LABEL: @test_fmax_v2f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0 store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 ret void @@ -29,7 +29,7 @@ define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, < ; SI: v_max_f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0 store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 ret void @@ -44,7 +44,7 @@ define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, < ; SI: v_max_f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0 store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 ret void @@ -67,7 +67,7 @@ define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, < ; SI: v_max_f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0 store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 ret void diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll index 4058247a6da9..277b8ce04c4e 100644 --- a/test/CodeGen/AMDGPU/fmaxnum.ll +++ b/test/CodeGen/AMDGPU/fmaxnum.ll @@ -14,7 +14,7 @@ declare double @llvm.maxnum.f64(double, double) ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %val = call float @llvm.maxnum.f32(float %a, 
float %b) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwin ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 ret void @@ -44,7 +44,7 @@ define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0 store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 ret void @@ -70,7 +70,7 @@ define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0 store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 ret void @@ -114,7 +114,7 @@ define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W -define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0 store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 ret void @@ -128,7 +128,7 @@ define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -143,7 +143,7 @@ define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} ; EG: 2143289344(nan) -define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -157,7 +157,7 @@ define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW 
[[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -171,7 +171,7 @@ define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -185,7 +185,7 @@ define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -199,7 +199,7 @@ define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -213,7 +213,7 @@ define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -227,7 +227,7 @@ define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -239,7 +239,7 @@ define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -250,7 +250,7 @@ define void 
@fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -262,7 +262,7 @@ define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -274,7 +274,7 @@ define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll index e66678069130..d2cfc713ed37 100644 --- a/test/CodeGen/AMDGPU/fmed3.ll +++ b/test/CodeGen/AMDGPU/fmed3.ll @@ -1,18 +1,33 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s + + +; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32: +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}} +; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %a.add = fadd nnan float %a, 1.0 + %max = call float @llvm.maxnum.f32(float %a.add, float 2.0) + %med = 
call float @llvm.minnum.f32(float %max, float 4.0) -declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare float @llvm.minnum.f32(float, float) #0 -declare float @llvm.maxnum.f32(float, float) #0 -declare double @llvm.minnum.f64(double, double) #0 -declare double @llvm.maxnum.f64(double, double) #0 + store float %med, float addrspace(1)* %outgep + ret void +} ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32: ; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -30,7 +45,7 @@ define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1) ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -48,7 +63,7 @@ define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float add ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -64,7 +79,7 @@ define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float add ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32: ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -81,7 +96,7 @@ define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, flo ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32: ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -98,7 +113,7 @@ define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* 
%out, float ad ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64: ; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0 ; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0 -define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid @@ -113,7 +128,7 @@ define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace( ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32: ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 -define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -131,7 +146,7 @@ define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addr ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -149,6 +164,812 @@ define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrs ret void } +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %a.fneg = fsub float -0.0, %a + %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod1: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* 
%aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %b.fneg = fsub float -0.0, %b + %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod2: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %c.fneg = fsub float -0.0, %c + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod012: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]| +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.fneg = fsub float -0.0, %a + %b.fabs = call float @llvm.fabs.f32(float %b) + %c.fabs = call float @llvm.fabs.f32(float %c) + %c.fabs.fneg = fsub float -0.0, %c.fabs + + %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs) + %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + + store float 
%med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_negabs012: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]| +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.fabs = call float @llvm.fabs.f32(float %a) + %a.fabs.fneg = fsub float -0.0, %a.fabs + %b.fabs = call float @llvm.fabs.f32(float %b) + %b.fabs.fneg = fsub float -0.0, %b.fabs + %c.fabs = call float @llvm.fabs.f32(float %c) + %c.fabs.fneg = fsub float -0.0, %c.fabs + + %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) + %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_nnan_inputs_med3_f32_pat0: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]] +; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] +; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] +define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + %c.nnan = fadd nnan float %c, 4.0 + + %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) + %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; 16 combinations + +; 0: max(min(x, y), min(max(x, y), z)) +; 1: max(min(x, y), min(max(y, x), z)) +; 2: max(min(x, y), min(z, max(x, y))) +; 3: max(min(x, y), min(z, max(y, x))) +; 4: max(min(y, x), min(max(x, y), z)) +; 5: max(min(y, x), min(max(y, x), z)) +; 6: max(min(y, x), min(z, max(x, y))) +; 7: max(min(y, x), min(z, max(y, x))) +; +; + commute outermost max + +; GCN-LABEL: 
{{^}}v_test_global_nnans_med3_f32_pat0: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat1: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat2: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp0, 
float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat3: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat4: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat5: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float 
@llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat6: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat7: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat8: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, 
float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat9: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat10: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat11: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = 
load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat12: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat13: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat14: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile 
float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat15: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %b, float %a) + %tmp1 = call float @llvm.maxnum.f32(float %b, float %a) + %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1) + %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; --------------------------------------------------------------------- +; Negative patterns +; --------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0: +; GCN-DAG: v_min_f32 +; GCN-DAG: v_max_f32 +; GCN: v_min_f32 +; GCN: v_max_f32 +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + store volatile float %tmp0, float addrspace(1)* undef + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1: +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load 
volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + store volatile float %tmp1, float addrspace(1)* undef + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2: +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + store volatile float %tmp2, float addrspace(1)* undef + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + + +; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0: +define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0: +define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.nnan = fadd float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + %c.nnan = fadd nnan float %c, 4.0 + + %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) + %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) + %tmp2 = call 
float @llvm.minnum.f32(float %tmp1, float %c.nnan) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0: +define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd float %b, 2.0 + %c.nnan = fadd nnan float %c, 4.0 + + %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) + %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0: +define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + %c.nnan = fadd float %c, 4.0 + + %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) + %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_min_f32 +; GCN: v_max_f32 +; GCN: v_min_f32 +; GCN: v_max_f32 +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %a.fneg = fsub float -0.0, %a + %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call 
float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; A simple min and max is not sufficient +; GCN-LABEL: {{^}}v_test_global_nnans_min_max_f32: +; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]] +; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]] +define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %max = call float @llvm.maxnum.f32(float %a, float %b) + %minmax = call float @llvm.minnum.f32(float %max, float %c) + store float %minmax, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16: +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 +; SI: v_cvt_f16_f32 + +; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0 +; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0 +; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0 + +; GFX9: v_add_f16_e32 v{{[0-9]+}}, 1.0 +; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %a.add = fadd nnan half %a, 1.0 + %max = call half @llvm.maxnum.f16(half %a.add, half 2.0) + %med = call half @llvm.minnum.f16(half %max, half 4.0) + + store half %med, half addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0: +; GCN: {{buffer_|flat_}}load_ushort [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_ushort [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_ushort [[C:v[0-9]+]] + +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 +; SI: v_add_f32_e32 +; SI: v_add_f32_e32 +; SI: v_med3_f32 +; SI: v_cvt_f16_f32_e32 + + +; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]] +; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] +; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] + +; VI-DAG: v_min_f16 +; VI-DAG: v_max_f16 +; VI: v_min_f16 +; VI: v_max_f16 + +; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] +define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load volatile half, half 
addrspace(1)* %gep0 + %b = load volatile half, half addrspace(1)* %gep1 + %c = load volatile half, half addrspace(1)* %gep2 + + %a.nnan = fadd nnan half %a, 1.0 + %b.nnan = fadd nnan half %b, 2.0 + %c.nnan = fadd nnan half %c, 4.0 + + %tmp0 = call half @llvm.minnum.f16(half %a.nnan, half %b.nnan) + %tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan) + %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan) + %med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2) + store half %med3, half addrspace(1)* %outgep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.minnum.f32(float, float) #0 +declare float @llvm.maxnum.f32(float, float) #0 +declare double @llvm.minnum.f64(double, double) #0 +declare double @llvm.maxnum.f64(double, double) #0 +declare half @llvm.fabs.f16(half) #0 +declare half @llvm.minnum.f16(half, half) #0 +declare half @llvm.maxnum.f16(half, half) #0 + attributes #0 = { nounwind readnone } attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll index 3102ffdbdd28..3183f77f090b 100644 --- a/test/CodeGen/AMDGPU/fmin3.ll +++ b/test/CodeGen/AMDGPU/fmin3.ll @@ -11,7 +11,7 @@ declare float @llvm.minnum.f32(float, float) nounwind readnone ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 @@ -29,7 +29,7 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 diff --git a/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll new file mode 100644 index 000000000000..fdfe533b3d0c --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s +; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s + +; FIXME: Should replace unsafe-fp-math with no signed zeros. 
+
+; GCN-LABEL: {{^}}min_fneg_select_regression_0:
+; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0
+define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
+  %fneg.a = fsub float -0.0, %a
+  %cmp.a = fcmp ult float %a, 1.0
+  %min.a = select i1 %cmp.a, float %fneg.a, float -1.0
+  ret float %min.a
+}
+
+; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
+; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
+define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
+  %fneg.a = fsub float -0.0, %a
+  %cmp.a = fcmp ult float %a, -1.0
+  %min.a = select i1 %cmp.a, float %fneg.a, float 1.0
+  ret float %min.a
+}
+
+; GCN-LABEL: {{^}}max_fneg_select_regression_0:
+; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
+define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
+  %fneg.a = fsub float -0.0, %a
+  %cmp.a = fcmp ugt float %a, 1.0
+  %min.a = select i1 %cmp.a, float %fneg.a, float -1.0
+  ret float %min.a
+}
+
+; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0:
+; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
+define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 {
+  %fneg.a = fsub float -0.0, %a
+  %cmp.a = fcmp ugt float %a, -1.0
+  %min.a = select i1 %cmp.a, float %fneg.a, float 1.0
+  ret float %min.a
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index 6982ee0c0cb3..99bc114831ca 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -3,7 +3,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; FUNC-LABEL: @test_fmin_legacy_f64
-define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
+define amdgpu_kernel void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
   %r0 = extractelement <4 x double> %reg0, i32 0
   %r1 = extractelement <4 x double> %reg0, i32 1
   %r2 = fcmp uge double %r0, %r1
@@ -14,7 +14,7 @@ define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double>
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ule_f64
-define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -29,7 +29,7 @@ define void @test_fmin_legacy_ule_f64(double addrspac
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ole_f64
-define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -44,7 +44,7 @@ define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out,
double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -59,7 +59,7 @@ define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmin_legacy_ult_f64 -define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll index 79acd02e6d1f..52336f95a909 100644 --- a/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -14,7 +14,7 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; EG: MIN * ; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { +define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = fcmp uge float %r0, %r1 @@ -34,7 +34,7 @@ define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]] -define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b %val = select i1 %cmp, float %a, float %b store float %val, float addrspace(1)* %out, align 4 @@ -46,7 +46,7 @@ define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, floa ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -65,7 +65,7 @@ define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -84,7 +84,7 
@@ define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -103,7 +103,7 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -122,7 +122,7 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 @@ -144,7 +144,7 @@ define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x flo ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1 @@ -166,7 +166,7 @@ define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x flo ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 @@ -188,7 +188,7 @@ define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x flo ; SI-NEXT: v_cndmask_b32 ; SI-NOT: v_min ; SI: s_endpgm -define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* 
%out1, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/fminnum.f64.ll b/test/CodeGen/AMDGPU/fminnum.f64.ll index 0f929d6a81f0..01b267411212 100644 --- a/test/CodeGen/AMDGPU/fminnum.f64.ll +++ b/test/CodeGen/AMDGPU/fminnum.f64.ll @@ -9,7 +9,7 @@ declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 ; FUNC-LABEL: @test_fmin_f64 ; SI: v_min_f64 -define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { %val = call double @llvm.minnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* %out, align 8 ret void @@ -18,7 +18,7 @@ define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) noun ; FUNC-LABEL: @test_fmin_v2f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 ret void @@ -29,7 +29,7 @@ define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, < ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 ret void @@ -44,7 +44,7 @@ define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, < ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 ret void @@ -67,7 +67,7 @@ define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, < ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 ret void diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll index abd2b9d3e4d1..9e997c7a1045 100644 --- a/test/CodeGen/AMDGPU/fminnum.ll +++ b/test/CodeGen/AMDGPU/fminnum.ll @@ -13,7 +13,7 @@ declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define 
void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %val = call float @llvm.minnum.f32(float %a, float %b) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwin ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 ret void @@ -43,7 +43,7 @@ define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 ret void @@ -69,7 +69,7 @@ define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 ret void @@ -113,7 +113,7 @@ define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W -define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 ret void @@ -127,7 +127,7 @@ define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -142,7 +142,7 @@ define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} ; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) -define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, 
float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -156,7 +156,7 @@ define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -170,7 +170,7 @@ define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -184,7 +184,7 @@ define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -198,7 +198,7 @@ define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -212,7 +212,7 @@ define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -226,7 +226,7 @@ define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -237,7 +237,7 @@ define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel 
void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float %a, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -248,7 +248,7 @@ define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -260,7 +260,7 @@ define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -272,7 +272,7 @@ define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 8663a2129fc0..4002712ab169 100644 --- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -1,5 +1,7 @@ ; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s + ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't ; make add an instruction if the fadd has more than one use. 
@@ -19,9 +21,9 @@ declare float @llvm.fabs.f32(float) #1 ; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI: v_cndmask_b32_e32 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| -; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0 -define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { +; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 +define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { %a11 = fadd fast float %y, -1.0 %a12 = call float @llvm.fabs.f32(float %a11) %a13 = fadd fast float %x, -1.0 @@ -42,7 +44,7 @@ define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, floa ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %mad = fadd fast float %mul2, %y @@ -57,7 +59,7 @@ define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, floa ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) %mul2 = fmul fast float %x.abs, 2.0 @@ -70,7 +72,7 @@ define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, floa ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32: ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}} ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}} -define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { +define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) %mul2 = fmul fast float %x.abs, 2.0 @@ -85,7 +87,7 @@ define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x ; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_dword [[RESULT]] -define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %muln2 = fmul fast float %x, -2.0 @@ -99,7 +101,7 @@ define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { ; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_dword [[RESULT]] -define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float 
%x, 2.0 %muln2 = fmul fast float %x, -3.0 @@ -114,9 +116,10 @@ define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { ; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI: v_cndmask_b32_e32 ; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| -; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0 -define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { +; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 +; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 +define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %z = bitcast i16 %z.arg to half @@ -136,11 +139,14 @@ define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x. ; GCN-LABEL: {{^}}multiple_use_fadd_fmac_f16: ; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}} -; GCN-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0 + +; VI-FLUSH-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0 +; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}} + ; GCN-DAG: buffer_store_short [[MUL2]] ; GCN-DAG: buffer_store_short [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -153,11 +159,14 @@ define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x. ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f16: ; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}| -; GCN-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}} + +; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}} +; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}} + ; GCN-DAG: buffer_store_short [[MUL2]] ; GCN-DAG: buffer_store_short [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -170,9 +179,13 @@ define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x. 
} ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f16: -; GCN: v_mad_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}} -; GCN: v_mad_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}} -define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { +; VI-FLUSH: v_mad_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}} +; VI-FLUSH: v_mad_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}} + +; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}} +; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}} + +define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %z = bitcast i16 %z.arg to half @@ -190,7 +203,7 @@ define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroe ; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_short [[RESULT]] -define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -206,7 +219,7 @@ define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 ze ; GCN: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_short [[RESULT]] -define void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll index 4f47d2c8e755..4e96091ae256 100644 --- a/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/test/CodeGen/AMDGPU/fmul.f16.ll @@ -11,7 +11,7 @@ ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16( +define amdgpu_kernel void @fmul_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -25,14 +25,13 @@ entry: ; GCN-LABEL: {{^}}fmul_f16_imm_a ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16_imm_a( +define amdgpu_kernel void @fmul_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -44,14 +43,14 @@ entry: ; GCN-LABEL: {{^}}fmul_f16_imm_b ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] + ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, 
v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16_imm_b( +define amdgpu_kernel void @fmul_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -61,27 +60,30 @@ entry: ret void } -; GCN-LABEL: {{^}}fmul_v2f16 +; GCN-LABEL: {{^}}fmul_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16( +define amdgpu_kernel void @fmul_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -93,25 +95,22 @@ entry: ret void } -; GCN-LABEL: {{^}}fmul_v2f16_imm_a +; GCN-LABEL: {{^}}fmul_v2f16_imm_a: ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] +; GCN-DAG: 
v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16_imm_a( +define amdgpu_kernel void @fmul_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -121,25 +120,22 @@ entry: ret void } -; GCN-LABEL: {{^}}fmul_v2f16_imm_b +; GCN-LABEL: {{^}}fmul_v2f16_imm_b: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}} -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16_imm_b( +define amdgpu_kernel void @fmul_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll index d0c39b539456..125de7aabfd4 100644 --- a/test/CodeGen/AMDGPU/fmul.ll +++ b/test/CodeGen/AMDGPU/fmul.ll @@ -6,24 +6,20 @@ ; GCN: v_mul_f32 ; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { entry: %0 = fmul float %a, %b store float %0, float addrspace(1)* %out ret void } -declare float @llvm.r600.load.input(i32) readnone - -declare void @llvm.AMDGPU.store.output(float, i32) - ; FUNC-LABEL: {{^}}fmul_v2f32: ; GCN: v_mul_f32 ; GCN: v_mul_f32 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} -define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { entry: %0 = fmul <2 x float> %a, %b store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -40,7 +36,7 @@ entry: ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> 
addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -53,7 +49,7 @@ define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; GCN: v_mul_f32 ; GCN-NOT: v_mul_f32 ; GCN: s_endpgm -define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { %y = fmul float %x, 2.0 %z = fmul float %y, 3.0 store float %z, float addrspace(1)* %out @@ -65,7 +61,7 @@ define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { ; GCN-NOT: v_mul_f32 ; GCN-NOT: v_mad_f32 ; GCN: s_endpgm -define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { %y = fmul float %x, 3.0 %z = fmul float %y, 2.0 store float %z, float addrspace(1)* %out @@ -79,7 +75,7 @@ define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { ; GCN: v_mul_f32 ; GCN: v_mul_f32 ; GCN-NOT: v_mul_f32 -define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { %a = fmul float %x, 5.0 %b = fsub float -0.0, %a %c = fmul float %b, %y diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll index 3c222eaba89d..f14233f267b2 100644 --- a/test/CodeGen/AMDGPU/fmul64.ll +++ b/test/CodeGen/AMDGPU/fmul64.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}fmul_f64: ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -15,7 +15,7 @@ define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; FUNC-LABEL: {{^}}fmul_v2f64: ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 @@ -29,7 +29,7 @@ define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace( ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, <4 x double> addrspace(1)* %in2) { %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll index 500b00bdcf87..9b713419e747 100644 --- a/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s -; RUN: llc 
-march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.fmuladd.f16(half, half, half) #1 @@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #1 ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2, half addrspace(1)* %in3) #0 { %r0 = load half, half addrspace(1)* %in1 %r1 = load half, half addrspace(1)* %in2 @@ -34,7 +34,7 @@ define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half 
addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -56,7 +56,7 @@ define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -82,7 +82,7 @@ define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f16(half addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -111,7 +111,7 @@ define void @fadd_a_a_b_f16(half addrspace(1)* %out, ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f16(half addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -134,7 +134,7 @@ define void @fadd_b_a_a_f16(half addrspace(1)* %out, ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -156,7 +156,7 @@ define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -180,7 +180,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -202,7 +202,7 @@ define void 
@fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -231,7 +231,7 @@ define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -261,7 +261,7 @@ define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspa ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -291,7 +291,7 @@ define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half add ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -323,7 +323,7 @@ define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half ad ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -355,7 +355,7 @@ define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, hal ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI-DENORM: flat_store_short 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -388,7 +388,7 @@ define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half add ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -419,7 +419,7 @@ define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half ad ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -447,7 +447,7 @@ define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll index e4b1053ff25c..fb605dd2e4bd 100644 --- a/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -25,7 +25,7 @@ declare float @llvm.fabs.f32(float) #1 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) #0 { %r0 = load float, float addrspace(1)* %in1 %r1 = load float, float addrspace(1)* %in2 @@ -45,7 +45,7 @@ define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; GCN-DENORM-STRICT: v_mul_f32_e32 ; GCN-DENORM-STRICT: v_add_f32_e32 -define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) #0 { %r0 = load volatile float, float addrspace(1)* %in1 %r1 = load volatile float, float addrspace(1)* %in2 @@ -71,7 +71,7 @@ define void 
@fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; SI-DENORM buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -100,7 +100,7 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* % ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -132,7 +132,7 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* % ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f32(float addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -167,7 +167,7 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out, ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f32(float addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -196,7 +196,7 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out, ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -225,7 +225,7 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1 ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -256,7 +256,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 
@llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -286,7 +286,7 @@ define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1 ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -318,7 +318,7 @@ define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1 ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -353,7 +353,7 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -387,7 +387,7 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -422,7 +422,7 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -460,7 +460,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* 
noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -496,7 +496,7 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -532,7 +532,7 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -563,7 +563,7 @@ define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* % ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll index f5e64b3c5941..86e91e04b0fc 100644 --- a/test/CodeGen/AMDGPU/fmuladd.f64.ll +++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}fmuladd_f64: ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -22,7 +22,7 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -44,7 +44,7 @@ define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI: buffer_store_dwordx2 [[RESULT]] ; 
VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f64(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -72,7 +72,7 @@ define void @fadd_a_a_b_f64(double addrspace(1)* %out, ; SI: buffer_store_dwordx2 [[RESULT]] ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f64(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -94,7 +94,7 @@ define void @fadd_b_a_a_f64(double addrspace(1)* %out, ; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} ; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} -define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext @@ -117,7 +117,7 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add ; GCN-STRICT: v_add_f64 ; GCN-CONTRACT: v_fma_f64 -define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -139,7 +139,7 @@ define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out, ; GCN-STRICT: v_add_f64 ; GCN-CONTRACT: v_fma_f64 -define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -158,7 +158,7 @@ define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out, ; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast: ; GCN: v_fma_f64 -define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll new file mode 100644 index 000000000000..bdd3c04fd318 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -0,0 +1,107 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s + +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 + +; GCN-LABEL: {{^}}fmuladd_v2f16: +; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, + <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 { + %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1 + %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2 + %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3 + %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) + store <2 x half> %r3, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] +; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] + +; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + + %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 + %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 + + %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> , <2 x half> %r1, <2 x half> %r2) + store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] +; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] + +; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + 
%gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + + %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 + %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 + + %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> , <2 x half> %r2) + store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_a_a_b_v2f16: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] +; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] + +; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] +; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] + +; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out, + <2 x half> addrspace(1)* %in1, + <2 x half> addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + + %r0 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 + %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 + + %add.0 = fadd <2 x half> %r0, %r0 + %add.1 = fadd <2 x half> %add.0, %r1 + store <2 x half> %add.1, <2 x half> addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fnearbyint.ll b/test/CodeGen/AMDGPU/fnearbyint.ll index 5423fadf81e2..4ff3bbbcbc3e 100644 --- a/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/test/CodeGen/AMDGPU/fnearbyint.ll @@ -13,41 +13,41 @@ declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0 declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 -define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { entry: %0 = call float @llvm.nearbyint.f32(float %in) store float %0, float addrspace(1)* %out ret void } -define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out ret void } -define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +define amdgpu_kernel void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { entry: %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out ret void } -define void @nearbyint_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @nearbyint_f64(double addrspace(1)* %out, double %in) { entry: %0 = call double @llvm.nearbyint.f64(double %in) store double %0, double addrspace(1)* %out ret void } -define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { entry: %0 
= call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) store <2 x double> %0, <2 x double> addrspace(1)* %out ret void } -define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { entry: %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) store <4 x double> %0, <4 x double> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll index 3f9928c2b623..1c0e9a2f13ce 100644 --- a/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -enable-unsafe-fp-math -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s ; -------------------------------------------------------------------------------- ; fadd tests @@ -14,7 +14,7 @@ ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -35,7 +35,7 @@ define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -53,12 +53,16 @@ define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrsp ; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] -; GCN-NEXT: buffer_store_dword [[NEG_ADD]] + +; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] +; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] + +; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]] +; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[ADD]] +; GCN: buffer_store_dword [[NEG_ADD]] ; GCN-NEXT: buffer_store_dword [[MUL]] -define void 
@v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -83,7 +87,7 @@ define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrsp ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -107,7 +111,7 @@ define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -131,7 +135,7 @@ define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -160,7 +164,7 @@ define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace( ; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]] -define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -188,7 +192,7 @@ define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float add ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 
@llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -214,7 +218,7 @@ define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float add ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -235,7 +239,7 @@ define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[ADD]] -define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -253,12 +257,11 @@ define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrsp ; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] -; GCN-NEXT: buffer_store_dword [[NEG_MUL]] -; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] +; GCN-NEXT: buffer_store_dword [[MUL0]] +; GCN-NEXT: buffer_store_dword [[MUL1]] +define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -279,7 +282,7 @@ define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrsp ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -299,7 +302,7 @@ define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, 
float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -319,7 +322,7 @@ define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -342,7 +345,7 @@ define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace( ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -365,7 +368,7 @@ define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float add ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -382,6 +385,300 @@ define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float add ret void } +; -------------------------------------------------------------------------------- +; fminnum tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_minnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %min = call float @llvm.minnum.f32(float %a, float %b) + 
%fneg = fsub float -0.000000e+00, %min + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_self_minnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %min = call float @llvm.minnum.f32(float %a, float %a) + %min.fneg = fsub float -0.0, %min + store float %min.fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0 +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %min = call float @llvm.minnum.f32(float 4.0, float %a) + %fneg = fsub float -0.000000e+00, %min + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0 +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %min = call float @llvm.minnum.f32(float -4.0, float %a) + %fneg = fsub float -0.000000e+00, %min + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_0_minnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %min = call float @llvm.minnum.f32(float 0.0, float %a) + %fneg = fsub float -0.000000e+00, %min + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0 +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float 
addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %min = call float @llvm.minnum.f32(float -0.0, float %a) + %fneg = fsub float -0.000000e+00, %min + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]] +; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %min = call float @llvm.minnum.f32(float 0.0, float %a) + %fneg = fsub float -0.000000e+00, %min + %mul = fmul float %fneg, %b + store float %mul, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]] +; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] +; GCN-NEXT: buffer_store_dword [[MAX0]] +; GCN-NEXT: buffer_store_dword [[MUL1]] +define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %min = call float @llvm.minnum.f32(float %a, float %b) + %fneg = fsub float -0.000000e+00, %min + %use1 = fmul float %min, 4.0 + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------------------- +; fmaxnum tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_maxnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load 
volatile float, float addrspace(1)* %b.gep + %min = call float @llvm.maxnum.f32(float %a, float %b) + %fneg = fsub float -0.000000e+00, %min + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %min = call float @llvm.maxnum.f32(float %a, float %a) + %min.fneg = fsub float -0.0, %min + store float %min.fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0 +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %min = call float @llvm.maxnum.f32(float 4.0, float %a) + %fneg = fsub float -0.000000e+00, %min + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0 +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %min = call float @llvm.maxnum.f32(float -4.0, float %a) + %fneg = fsub float -0.000000e+00, %min + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %max = call float @llvm.maxnum.f32(float 0.0, float %a) + %fneg = fsub float -0.000000e+00, %max + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0 +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %max = call float @llvm.maxnum.f32(float -0.0, float %a) + %fneg = fsub float -0.000000e+00, %max + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] +; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %max = call float @llvm.maxnum.f32(float 0.0, float %a) + %fneg = fsub float -0.000000e+00, %max + %mul = fmul float %fneg, %b + store float %mul, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]] +; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] +; GCN-NEXT: buffer_store_dword [[MAX0]] +; GCN-NEXT: buffer_store_dword [[MUL1]] +define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %min = call float @llvm.maxnum.f32(float %a, float %b) + %fneg = fsub float -0.000000e+00, %min + %use1 = fmul float %min, 4.0 + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + ; -------------------------------------------------------------------------------- ; fma tests ; -------------------------------------------------------------------------------- @@ -396,7 +693,7 @@ define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float add ; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 
%tid.ext @@ -420,7 +717,7 @@ define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr ; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] ; GCN-NEXT: buffer_store_dword [[NEG_FMA]] ; GCN-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -441,12 +738,17 @@ define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrsp ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] -; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]] + +; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] +; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] +; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]] + +; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] +; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]] + ; GCN-NEXT: buffer_store_dword [[NEG_FMA]] ; GCN-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -474,7 +776,7 @@ define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrsp ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -501,7 +803,7 @@ define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -528,7 +830,7 @@ define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1 ; GCN-NSZ: v_fma_f32 
[[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -556,7 +858,7 @@ define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspac ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -584,7 +886,7 @@ define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspac ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] -define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -614,7 +916,7 @@ define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1 ; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]] -define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -644,7 +946,7 @@ define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float a ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]] ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 { +define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -677,7 +979,7 @@ define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float a 
; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -697,12 +999,17 @@ define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.pt ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] -; GCN-NEXT: buffer_store_dword [[NEG_C]] + +; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]] +; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]] +; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] + +; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]] +; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]] + +; GCN: buffer_store_dword [[NEG_MAD]] ; GCN-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { +define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -728,7 +1035,7 @@ define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addr ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -744,7 +1051,7 @@ define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrsp ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -763,7 +1070,7 @@ define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float a ; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN: buffer_store_dwordx2 [[RESULT]] ; GCN: buffer_store_dword [[FNEG_A]] -define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void 
@v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -783,7 +1090,7 @@ define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %ou ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}} ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}} -define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -803,7 +1110,7 @@ define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %ou ; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}} ; GCN: buffer_store_dwordx2 [[MUL]] -define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -819,7 +1126,7 @@ define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspac ; FIXME: Source modifiers not folded for f16->f32 ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32: -define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext @@ -833,7 +1140,7 @@ define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out } ; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32: -define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext @@ -855,7 +1162,7 @@ define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -871,7 +1178,7 @@ define void 
@v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspa ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -888,10 +1195,9 @@ define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double ad ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}} ; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]] -; GCN-DAG: v_mov_b32_e32 v[[NEG_A_LO:[0-9]+]], v[[A_LO]] ; GCN: buffer_store_dword [[RESULT]] -; GCN: buffer_store_dwordx2 v{{\[}}[[NEG_A_LO]]:[[NEG_A_HI]]{{\]}} -define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { +; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}} +define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -911,7 +1217,7 @@ define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, ; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}} ; GCN: buffer_store_dword [[RESULT]] ; GCN: buffer_store_dwordx2 [[USE1]] -define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 { +define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -930,7 +1236,7 @@ define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_short [[RESULT]] -define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -946,7 +1252,7 @@ define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] ; GCN: buffer_store_short [[RESULT]] -define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -965,7 +1271,7 @@ define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, 
float addr ; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]] ; GCN: buffer_store_dword [[NEG]] ; GCN: buffer_store_dword [[CVT]] -define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext @@ -984,7 +1290,7 @@ define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN: buffer_store_short [[RESULT]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1004,7 +1310,7 @@ define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, ; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s ; GCN: buffer_store_short [[RESULT]] ; GCN: buffer_store_dword [[USE1]] -define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1027,7 +1333,7 @@ define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1043,7 +1349,7 @@ define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1062,7 +1368,7 @@ define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* % ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN: buffer_store_dword [[RESULT]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = 
getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1082,7 +1388,7 @@ define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrs ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1105,7 +1411,7 @@ define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrs ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1126,7 +1432,7 @@ define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1147,7 +1453,7 @@ define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[ADD]] -define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1170,7 +1476,7 @@ define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out ; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1191,7 +1497,7 @@ define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 
; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1211,7 +1517,7 @@ define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrsp ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1231,7 +1537,7 @@ define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrsp ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1254,7 +1560,7 @@ define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float add ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1277,7 +1583,7 @@ define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, fl ; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1300,12 +1606,11 @@ define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, fl ; GCN-LABEL: {{^}}v_fneg_sin_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_mov_b32_e32 
[[K:v[0-9]+]], 0x3e22f983 -; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[K]], -[[A]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]] ; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]] ; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -1321,27 +1626,509 @@ define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %sin = call float @llvm.amdgcn.sin.f32(float %a) - %fneg = fsub float -0.000000e+00, %sin + %fneg = fsub float -0.0, %sin store float %fneg, float addrspace(1)* %out.gep ret void } +; -------------------------------------------------------------------------------- +; ftrunc tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_trunc_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %trunc = call float @llvm.trunc.f32(float %a) + %fneg = fsub float -0.0, %trunc + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; fround tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_round_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_trunc_f32_e32 +; GCN: v_subrev_f32_e32 +; GCN: v_cndmask_b32 + +; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]] + +; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}} +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %round = call float @llvm.round.f32(float %a) + %fneg = fsub float -0.0, %round + store float %fneg, float 
addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; rint tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_rint_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %rint = call float @llvm.rint.f32(float %a) + %fneg = fsub float -0.0, %rint + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; nearbyint tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_nearbyint_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %nearbyint = call float @llvm.nearbyint.f32(float %a) + %fneg = fsub float -0.0, %nearbyint + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; vintrp tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_interp_p1_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] +; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]] +; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]] +define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0) + %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0) + store volatile float %intrp0, float addrspace(1)* %out.gep + store volatile float %intrp1, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_interp_p2_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] +; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]] +; GCN: v_interp_p2_f32 v{{[0-9]+}}, 
[[MUL]] +define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0) + %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0) + store volatile float %intrp0, float addrspace(1)* %out.gep + store volatile float %intrp1, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; CopyToReg tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_copytoreg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]] +; GCN: s_cbranch_scc1 + +; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]] +; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]] +; GCN: buffer_store_dword [[MUL1]] + +; GCN: buffer_store_dword [[MUL0]] +define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + %cmp0 = icmp eq i32 %d, 0 + br i1 %cmp0, label %if, label %endif + +if: + %mul1 = fmul float %fneg, %c + store volatile float %mul1, float addrspace(1)* %out.gep + br label %endif + +endif: + store volatile float %mul, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; inlineasm tests +; -------------------------------------------------------------------------------- + +; Can't fold into use, so should fold into source +; GCN-LABEL: {{^}}v_fneg_inlineasm_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] +; GCN: ; use [[MUL]] +; GCN: buffer_store_dword [[MUL]] +define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float 
addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + call void asm sideeffect "; use $0", "v"(float %fneg) #0 + store volatile float %fneg, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; inlineasm tests +; -------------------------------------------------------------------------------- + +; Can't fold into use, so should fold into source +; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] +; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]] +; GCN: ; use [[NEG]] +; GCN: buffer_store_dword [[MUL]] +define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + call void asm sideeffect "; use $0", "v"(float %fneg) #0 + store volatile float %mul, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; code size regression tests +; -------------------------------------------------------------------------------- + +; There are multiple users of the fneg that must use a VOP3 +; instruction, so there is no penalty +; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] + +; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]] +; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0 +; GCN-NEXT: buffer_store_dword [[FMA0]] +; GCN-NEXT: buffer_store_dword [[FMA1]] +define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + + %fneg.a = fsub float -0.0, %a + 
%fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) + %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0) + + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; There are multiple users, but both require using a larger encoding +; for the modifier. + +; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] + +; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]] +; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] +; GCN-NEXT: buffer_store_dword [[MUL0]] +; GCN-NEXT: buffer_store_dword [[MUL1]] +define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + + %fneg.a = fsub float -0.0, %a + %mul0 = fmul float %fneg.a, %b + %mul1 = fmul float %fneg.a, %c + + store volatile float %mul0, float addrspace(1)* %out + store volatile float %mul1, float addrspace(1)* %out + ret void +} + +; One user is VOP3 so has no cost to folding the modifier, the other does. 
+; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] + +; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0 +; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] + +; GCN: buffer_store_dword [[FMA0]] +; GCN-NEXT: buffer_store_dword [[MUL1]] +define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + + %fneg.a = fsub float -0.0, %a + %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0) + %mul1 = fmul float %fneg.a, %c + + store volatile float %fma0, float addrspace(1)* %out + store volatile float %mul1, float addrspace(1)* %out + ret void +} + +; The use of the fneg requires a code size increase, but folding into +; the source does not + +; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] + +; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0 +; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]] +; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]] + +; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0 +; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]] +; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]] + +; GCN: buffer_store_dword [[MUL1]] +; GCN-NEXT: buffer_store_dword [[MUL2]] +define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %d = load volatile float, float addrspace(1)* %d.gep + + %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0) + %fneg.fma0 = fsub float -0.0, %fma0 + %mul1 = fmul float %fneg.fma0, %c + %mul2 = fmul float %fneg.fma0, %d + + store volatile float %mul1, float addrspace(1)* %out + store volatile float %mul2, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64: +; GCN: {{buffer|flat}}_load_dwordx2 
[[A:v\[[0-9]+:[0-9]+\]]] +; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]] +; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]] + +; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0 +; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]] +; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]] + +; GCN: buffer_store_dwordx2 [[MUL0]] +; GCN: buffer_store_dwordx2 [[MUL1]] +define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext + %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %a.gep + %b = load volatile double, double addrspace(1)* %b.gep + %c = load volatile double, double addrspace(1)* %c.gep + %d = load volatile double, double addrspace(1)* %d.gep + + %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0) + %fneg.fma0 = fsub double -0.0, %fma0 + %mul1 = fmul double %fneg.fma0, %c + %mul2 = fmul double %fneg.fma0, %d + + store volatile double %mul1, double addrspace(1)* %out + store volatile double %mul2, double addrspace(1)* %out + ret void +} + +; %trunc.a has one fneg use, but it requires a code size increase and +; %the fneg can instead be folded for free into the fma. 
+ +; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] +; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] +; GCN: buffer_store_dword [[FMA0]] +define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %d = load volatile float, float addrspace(1)* %d.gep + + %trunc.a = call float @llvm.trunc.f32(float %a) + %trunc.fneg.a = fsub float -0.0, %trunc.a + %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) + store volatile float %fma0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] +; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] +; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] +; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]] +; GCN: buffer_store_dword [[FMA0]] +; GCN: buffer_store_dword [[MUL1]] +define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %d = load volatile float, float addrspace(1)* %d.gep + + %trunc.a = call float @llvm.trunc.f32(float %a) + %trunc.fneg.a = fsub float -0.0, %trunc.a + %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) + %mul1 = fmul float %trunc.a, %d + store volatile float %fma0, float addrspace(1)* %out + store volatile float %mul1, float addrspace(1)* %out + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fma.f32(float, float, float) #1 declare float @llvm.fmuladd.f32(float, float, float) #1 declare float @llvm.sin.f32(float) #1 +declare float @llvm.trunc.f32(float) #1 +declare float @llvm.round.f32(float) #1 +declare float 
@llvm.rint.f32(float) #1 +declare float @llvm.nearbyint.f32(float) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 + +declare double @llvm.fma.f64(double, double, double) #1 declare float @llvm.amdgcn.sin.f32(float) #1 declare float @llvm.amdgcn.rcp.f32(float) #1 declare float @llvm.amdgcn.rcp.legacy(float) #1 declare float @llvm.amdgcn.fmul.legacy(float, float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index d7d21311c1b9..555764c15519 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -1,33 +1,35 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: ; CI: v_cvt_f32_f16_e32 -; CI: v_cvt_f32_f16_e32 -; CI: v_sub_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}| +; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}| +; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}} -; VI-NOT: and -; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| -define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { +; GFX89-NOT: _and +; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| +define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs %fadd = fadd half %y, %fsub store half %fadd, half addrspace(1)* %out, align 2 ret void } ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16: -; CI: v_cvt_f32_f16_e32 -; CI: v_cvt_f32_f16_e32 -; CI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}| +; CI-DAG: v_cvt_f32_f16_e32 +; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}| +; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}} ; CI: v_cvt_f16_f32_e32 -; VI-NOT: and -; VI: v_mul_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}| -; VI-NOT: and -define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { +; GFX89-NOT: _and +; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}| +; GFX89-NOT: [[MUL]] +; GFX89: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs %fmul = fmul half %y, %fsub store half %fmul, half addrspace(1)* %out, align 2 ret void @@ -39,75 +41,113 @@ define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { ; GCN-LABEL: {{^}}fneg_fabs_free_f16: ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} -define void 
@fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs store half %fsub, half addrspace(1)* %out ret void } -; FIXME: Should use or ; GCN-LABEL: {{^}}fneg_fabs_f16: -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| - -; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} -define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { +; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} +define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs store half %fsub, half addrspace(1)* %out, align 2 ret void } ; GCN-LABEL: {{^}}v_fneg_fabs_f16: -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| - -; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} -define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} +define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %val = load half, half addrspace(1)* %in, align 2 %fabs = call half @llvm.fabs.f16(half %val) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs store half %fsub, half addrspace(1)* %out, align 2 ret void } ; FIXME: single bit op -; GCN-LABEL: {{^}}fneg_fabs_v2f16: -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| - -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: flat_store_dword -define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { +; GCN-LABEL: {{^}}s_fneg_fabs_v2f16: +; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: flat_store_dword + +; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}} +define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) - %fsub = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %fabs - store <2 x half> %fsub, <2 x half> addrspace(1)* %out + %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs + store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}fneg_fabs_v4f16: -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| - -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: flat_store_dwordx2 -define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { +; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], + +; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 +; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} +; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} + +; GCN: 
flat_store_dwordx2 +define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) - %fsub = fsub <4 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %fabs + %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs store <4 x half> %fsub, <4 x half> addrspace(1)* %out ret void } -declare half @llvm.fabs.f16(half) readnone -declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone -declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone +; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16: +; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| +; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| +; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} + +; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0 +; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0 + +; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0] +define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs + %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0> + store <2 x half> %mul, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16: +; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff +; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]] +; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]] +define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs + store <2 x half> %fabs, <2 x half> addrspace(1)* %out0 + store <2 x half> %fneg, <2 x half> addrspace(1)* %out1 + ret void +} + +; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16: +; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0] +define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs + %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0> + store <2 x half> %fabs, <2 x half> addrspace(1)* %out0 + store <2 x half> %mul, <2 x half> addrspace(1)* %out1 + ret void +} + +declare half @llvm.fabs.f16(half) #1 +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index d16e83fd4d5b..85f544032171 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}fneg_fabs_fadd_f64: ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}} -define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) { +define amdgpu_kernel void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) { %fabs = call double @llvm.fabs.f64(double %x) %fsub = fsub double -0.000000e+00, %fabs %fadd = fadd double %y, %fsub @@ -14,7 +14,7 @@ define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) ret void } -define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) { 
+define amdgpu_kernel void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) { %x = load double, double addrspace(1)* %xptr, align 8 %y = load double, double addrspace(1)* %xptr, align 8 %fabs = call double @llvm.fabs.f64(double %x) @@ -26,7 +26,7 @@ define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1) ; GCN-LABEL: {{^}}fneg_fabs_fmul_f64: ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}} -define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) { +define amdgpu_kernel void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) { %fabs = call double @llvm.fabs.f64(double %x) %fsub = fsub double -0.000000e+00, %fabs %fmul = fmul double %y, %fsub @@ -35,7 +35,7 @@ define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) } ; GCN-LABEL: {{^}}fneg_fabs_free_f64: -define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) { %bc = bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) %fsub = fsub double -0.000000e+00, %fabs @@ -46,7 +46,7 @@ define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) { ; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64: ; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { %bc = bitcast i64 %in to double %fabs = call double @fabs(double %bc) %fsub = fsub double -0.000000e+00, %fabs @@ -62,7 +62,7 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { ; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]] ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}} -define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { %fabs = call double @llvm.fabs.f64(double %in) %fsub = fsub double -0.000000e+00, %fabs store double %fsub, double addrspace(1)* %out, align 8 @@ -74,7 +74,7 @@ define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { ; GCN-NOT: 0x80000000 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs store <2 x double> %fsub, <2 x double> addrspace(1)* %out @@ -88,7 +88,7 @@ define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs store <4 x double> %fsub, <4 x double> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll index 
9ee1171306c7..a0cf37b159db 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and ; SI: v_subrev_f32_e64 {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}} -define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { +define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs %fadd = fadd float %y, %fsub @@ -17,7 +17,7 @@ define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { ; SI-NOT: and ; SI: v_mul_f32_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|, {{s[0-9]+}} ; SI-NOT: and -define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { +define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs %fmul = fmul float %y, %fsub @@ -35,7 +35,7 @@ define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { ; R600: -PV ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) %fsub = fsub float -0.000000e+00, %fabs @@ -49,7 +49,7 @@ define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { ; R600: -PV ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @fabs(float %bc) %fsub = fsub float -0.000000e+00, %fabs @@ -59,7 +59,7 @@ define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { ; FUNC-LABEL: {{^}}fneg_fabs_f32: ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) %fsub = fsub float -0.000000e+00, %fabs store float %fsub, float addrspace(1)* %out, align 4 @@ -68,7 +68,7 @@ define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { ; FUNC-LABEL: {{^}}v_fneg_fabs_f32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %val = load float, float addrspace(1)* %in, align 4 %fabs = call float @llvm.fabs.f32(float %val) %fsub = fsub float -0.000000e+00, %fabs @@ -86,7 +86,7 @@ define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} -define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs store <2 x float> %fsub, <2 x float> addrspace(1)* %out @@ -99,7 +99,7 @@ define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} 
; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} -define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs store <4 x float> %fsub, <4 x float> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll index e3dfd9201a24..626a0b50cce8 100644 --- a/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s ; FIXME: Should be able to do scalar op -; FUNC-LABEL: {{^}}s_fneg_f16: - -define void @s_fneg_f16(half addrspace(1)* %out, half %in) { - %fneg = fsub half -0.000000e+00, %in +; GCN-LABEL: {{^}}s_fneg_f16: +define amdgpu_kernel void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 { + %fneg = fsub half -0.0, %in store half %fneg, half addrspace(1)* %out ret void } @@ -13,49 +13,123 @@ define void @s_fneg_f16(half addrspace(1)* %out, half %in) { ; FIXME: Should be able to use bit operations when illegal type as ; well. 
-; FUNC-LABEL: {{^}}v_fneg_f16: +; GCN-LABEL: {{^}}v_fneg_f16: ; GCN: flat_load_ushort [[VAL:v[0-9]+]], - -; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]] -; CI: v_cvt_f16_f32_e64 [[CVT1:v[0-9]+]], -[[CVT0]] -; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]] - -; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]] +; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] -define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) { - %val = load half, half addrspace(1)* %in, align 2 - %fneg = fsub half -0.000000e+00, %val - store half %fneg, half addrspace(1)* %out +; SI: buffer_store_short [[XOR]] +define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid + %val = load half, half addrspace(1)* %gep.in, align 2 + %fneg = fsub half -0.0, %val + store half %fneg, half addrspace(1)* %gep.out ret void } -; FUNC-LABEL: {{^}}fneg_free_f16: +; GCN-LABEL: {{^}}fneg_free_f16: ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]], ; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} ; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] -define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { %bc = bitcast i16 %in to half %fsub = fsub half -0.0, %bc store half %fsub, half addrspace(1)* %out ret void } -; FUNC-LABEL: {{^}}v_fneg_fold_f16: +; GCN-LABEL: {{^}}v_fneg_fold_f16: ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]] -; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[CVT0]] -; CI: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[CVT0]], [[CVT0]] +; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]] +; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]] +; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]] ; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]] ; VI-NOT: [[NEG_VALUE]] ; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in %fsub = fsub half -0.0, %val %fmul = fmul half %fsub, %val store half %fmul, half addrspace(1)* %out ret void } + +; FIXME: Terrible code with VI and even worse with SI/CI +; GCN-LABEL: {{^}}s_fneg_v2f16: +; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} +; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_or_b32_e32 + +; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}} +; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] + +; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} + +define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { + %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in + store <2 x half> %fneg, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] +; GCN: v_xor_b32_e32 v{{[0-9]+}}, 
0x80008000, [[VAL]] +define amdgpu_kernel void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2 + %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val + store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fneg_free_v2f16: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; CIVI: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000 + +; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]] +define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { + %bc = bitcast i32 %in to <2 x half> + %fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc + store <2 x half> %fsub, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fold_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] + +; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}} +; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}} +; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_cvt_f16_f32 +; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_cvt_f16_f32 + +; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}} +define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %fsub = fsub <2 x half> <half -0.0, half -0.0>, %val + %fmul = fmul <2 x half> %fsub, %val + store <2 x half> %fmul, <2 x half> addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll index b7080f4622a3..9b4b4d6e942a 100644 --- a/test/CodeGen/AMDGPU/fneg.f64.ll +++ b/test/CodeGen/AMDGPU/fneg.f64.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}fneg_f64: ; GCN: v_xor_b32 -define void @fneg_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double %in) { %fneg = fsub double -0.000000e+00, %in store double %fneg, double addrspace(1)* %out ret void @@ -12,7 +12,7 @@ define void @fneg_f64(double addrspace(1)* %out, double %in) { ; FUNC-LABEL: {{^}}fneg_v2f64: ; GCN: v_xor_b32 ; GCN: v_xor_b32 -define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { +define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in store <2 x double> %fneg, <2 x double> addrspace(1)* %out ret void @@ -28,7 +28,7 @@ define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> ; GCN: v_xor_b32 ; GCN: v_xor_b32 ; GCN: v_xor_b32 -define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { +define amdgpu_kernel void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in store <4 x double> %fneg, <4 x double> addrspace(1)* %out ret void @@ -40,7 +40,7 @@ define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> ; FUNC-LABEL: {{^}}fneg_free_f64: ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 
-{{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { %bc = bitcast i64 %in to double %fsub = fsub double 0.0, %bc store double %fsub, double addrspace(1)* %out @@ -52,7 +52,7 @@ define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { ; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-NOT: xor ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) { %fsub = fsub double -0.0, %in %fmul = fmul double %fsub, %in store double %fmul, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fneg.ll b/test/CodeGen/AMDGPU/fneg.ll index 007c6dcadd9e..d1eabfb13c9a 100644 --- a/test/CodeGen/AMDGPU/fneg.ll +++ b/test/CodeGen/AMDGPU/fneg.ll @@ -6,7 +6,7 @@ ; R600: -PV ; GCN: v_xor_b32 -define void @s_fneg_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @s_fneg_f32(float addrspace(1)* %out, float %in) { %fneg = fsub float -0.000000e+00, %in store float %fneg, float addrspace(1)* %out ret void @@ -18,7 +18,7 @@ define void @s_fneg_f32(float addrspace(1)* %out, float %in) { ; GCN: v_xor_b32 ; GCN: v_xor_b32 -define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { +define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in store <2 x float> %fneg, <2 x float> addrspace(1)* %out ret void @@ -34,7 +34,7 @@ define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> ; GCN: v_xor_b32 ; GCN: v_xor_b32 ; GCN: v_xor_b32 -define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { +define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in store <4 x float> %fneg, <4 x float> addrspace(1)* %out ret void @@ -50,7 +50,7 @@ define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> ; R600-NOT: XOR ; R600: -KC0[2].Z -define void @fsub0_f32(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fsub0_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fsub = fsub float 0.0, %bc store float %fsub, float addrspace(1)* %out @@ -66,7 +66,7 @@ define void @fsub0_f32(float addrspace(1)* %out, i32 %in) { ; R600-NOT: XOR ; R600: -PV.W -define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { %bc = bitcast i32 %in to float %fsub = fsub float -0.0, %bc store float %fsub, float addrspace(1)* %out @@ -78,7 +78,7 @@ define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { ; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: xor ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fneg_fold_f32(float addrspace(1)* %out, float %in) { %fsub = fsub float -0.0, %in %fmul = fmul float %fsub, %in store float %fmul, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fold-cndmask.mir b/test/CodeGen/AMDGPU/fold-cndmask.mir new file mode 100644 index 000000000000..8dfec9166303 --- /dev/null +++ b/test/CodeGen/AMDGPU/fold-cndmask.mir @@ -0,0 +1,34 @@ +# RUN: llc -march=amdgcn -run-pass 
si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s + +# CHECK: %1 = V_MOV_B32_e32 0, implicit %exec +# CHECK: %2 = V_MOV_B32_e32 0, implicit %exec +# CHECK: %4 = COPY %3 +# CHECK: %5 = V_MOV_B32_e32 0, implicit %exec +# CHECK: %6 = V_MOV_B32_e32 0, implicit %exec +# CHECK: %7 = COPY %3 + +--- +name: fold_cndmask +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } +body: | + bb.0.entry: + %0 = IMPLICIT_DEF + %1 = V_CNDMASK_B32_e64 0, 0, %0, implicit %exec + %2 = V_CNDMASK_B32_e64 %1, %1, %0, implicit %exec + %3 = IMPLICIT_DEF + %4 = V_CNDMASK_B32_e64 %3, %3, %0, implicit %exec + %5 = COPY %1 + %6 = V_CNDMASK_B32_e64 %5, 0, %0, implicit %exec + %vcc = IMPLICIT_DEF + %7 = V_CNDMASK_B32_e32 %3, %3, implicit %exec, implicit %vcc + +... diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir new file mode 100644 index 000000000000..986c6b296c96 --- /dev/null +++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir @@ -0,0 +1,306 @@ +# RUN: llc -march=amdgcn -run-pass peephole-opt -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @no_fold_imm_madak_mac_clamp_f32() #0 { + ret void + } + + define amdgpu_kernel void @no_fold_imm_madak_mac_omod_f32() #0 { + ret void + } + + define amdgpu_kernel void @no_fold_imm_madak_mad_clamp_f32() #0 { + ret void + } + + define amdgpu_kernel void @no_fold_imm_madak_mad_omod_f32() #0 { + ret void + } + + attributes #0 = { nounwind } + +... +--- +# GCN-LABEL: name: no_fold_imm_madak_mac_clamp_f32 +# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec +# GCN-NEXT: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec + +name: no_fold_imm_madak_mac_clamp_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_64_xexec } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sgpr_64 } + - { id: 14, class: sgpr_128 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_64 } + - { id: 17, class: sgpr_128 } + - { id: 18, class: sgpr_128 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vreg_64 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vreg_64 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %27 = 
V_ASHRREV_I32_e32 31, %3, implicit %exec + %28 = REG_SEQUENCE %3, 1, %27, 2 + %11 = S_MOV_B32 61440 + %12 = S_MOV_B32 0 + %13 = REG_SEQUENCE killed %12, 1, killed %11, 2 + %14 = REG_SEQUENCE killed %5, 17, %13, 18 + %15 = S_MOV_B32 2 + %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec + %17 = REG_SEQUENCE killed %6, 17, %13, 18 + %18 = REG_SEQUENCE killed %4, 17, %13, 18 + %20 = COPY %29 + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec + %22 = COPY %29 + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec + %23 = V_MOV_B32_e32 1090519040, implicit %exec + %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec + %26 = COPY %29 + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... +--- +# GCN-LABEL: name: no_fold_imm_madak_mac_omod_f32 +# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec +# GCN: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit %exec + +name: no_fold_imm_madak_mac_omod_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_64_xexec } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sgpr_64 } + - { id: 14, class: sgpr_128 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_64 } + - { id: 17, class: sgpr_128 } + - { id: 18, class: sgpr_128 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vreg_64 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vreg_64 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %28 = REG_SEQUENCE %3, 1, %27, 2 + %11 = S_MOV_B32 61440 + %12 = S_MOV_B32 0 + %13 = REG_SEQUENCE killed %12, 1, killed %11, 2 + %14 = REG_SEQUENCE killed %5, 17, %13, 18 + %15 = S_MOV_B32 2 + %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec + %17 = REG_SEQUENCE killed %6, 17, %13, 18 + %18 = REG_SEQUENCE killed %4, 17, %13, 18 + %20 = COPY %29 + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec + %22 = COPY %29 + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec + %23 = V_MOV_B32_e32 1090519040, implicit %exec + %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit %exec + %26 = COPY %29 + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
+--- +# GCN: name: no_fold_imm_madak_mad_clamp_f32 +# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec +# GCN: %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec + +name: no_fold_imm_madak_mad_clamp_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_64_xexec } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sgpr_64 } + - { id: 14, class: sgpr_128 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_64 } + - { id: 17, class: sgpr_128 } + - { id: 18, class: sgpr_128 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vreg_64 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vreg_64 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %28 = REG_SEQUENCE %3, 1, %27, 2 + %11 = S_MOV_B32 61440 + %12 = S_MOV_B32 0 + %13 = REG_SEQUENCE killed %12, 1, killed %11, 2 + %14 = REG_SEQUENCE killed %5, 17, %13, 18 + %15 = S_MOV_B32 2 + %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec + %17 = REG_SEQUENCE killed %6, 17, %13, 18 + %18 = REG_SEQUENCE killed %4, 17, %13, 18 + %20 = COPY %29 + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec + %22 = COPY %29 + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec + %23 = V_MOV_B32_e32 1090519040, implicit %exec + %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec + %26 = COPY %29 + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
+--- +# GCN: name: no_fold_imm_madak_mad_omod_f32 +# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec +# GCN: %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit %exec + +name: no_fold_imm_madak_mad_omod_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_64_xexec } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sgpr_64 } + - { id: 14, class: sgpr_128 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_64 } + - { id: 17, class: sgpr_128 } + - { id: 18, class: sgpr_128 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vreg_64 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vreg_64 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %28 = REG_SEQUENCE %3, 1, %27, 2 + %11 = S_MOV_B32 61440 + %12 = S_MOV_B32 0 + %13 = REG_SEQUENCE killed %12, 1, killed %11, 2 + %14 = REG_SEQUENCE killed %5, 17, %13, 18 + %15 = S_MOV_B32 2 + %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec + %17 = REG_SEQUENCE killed %6, 17, %13, 18 + %18 = REG_SEQUENCE killed %4, 17, %13, 18 + %20 = COPY %29 + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec + %22 = COPY %29 + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec + %23 = V_MOV_B32_e32 1090519040, implicit %exec + %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit %exec + %26 = COPY %29 + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
diff --git a/test/CodeGen/AMDGPU/fp-classify.ll b/test/CodeGen/AMDGPU/fp-classify.ll index b7ffaed70c5a..cbc42979f2ee 100644 --- a/test/CodeGen/AMDGPU/fp-classify.ll +++ b/test/CodeGen/AMDGPU/fp-classify.ll @@ -9,7 +9,7 @@ declare double @llvm.fabs.f64(double) #1 ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] ; SI-NOT: v_cmp ; SI: s_endpgm -define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 %ext = zext i1 %cmp to i32 @@ -20,7 +20,7 @@ define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { ; SI-LABEL: {{^}}test_not_isinf_pattern_0: ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 %ext = zext i1 %cmp to i32 @@ -31,7 +31,7 @@ define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x ; SI-LABEL: {{^}}test_not_isinf_pattern_1: ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 %ext = zext i1 %cmp to i32 @@ -45,7 +45,7 @@ define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] ; SI-NOT: v_cmp ; SI: s_endpgm -define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -59,7 +59,7 @@ define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) ; SI-LABEL: {{^}}test_isfinite_not_pattern_0: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 @@ -73,7 +73,7 @@ define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float ; SI-LABEL: {{^}}test_isfinite_not_pattern_1: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %ninf = fcmp une float %x, 0x7FF0000000000000 %and = and i1 %ord, %ninf @@ -86,7 +86,7 @@ define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float ; SI-LABEL: {{^}}test_isfinite_not_pattern_2: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture 
%out, float %x, float %y) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -100,7 +100,7 @@ define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float ; SI-LABEL: {{^}}test_isfinite_not_pattern_3: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp uno float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -114,7 +114,7 @@ define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float ; SI-LABEL: {{^}}test_isfinite_not_pattern_4: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 diff --git a/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 01bc53ff35a5..ce041364b76d 100644 --- a/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -14,7 +14,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone ; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]] ; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]] ; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]] -define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { %val = load i16, i16 addrspace(1)* %in, align 2 %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone store float %cvt, float addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/test/CodeGen/AMDGPU/fp16_to_fp64.ll index a9f493bf0ccd..70f0c0c1afdb 100644 --- a/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone ; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { %val = load i16, i16 addrspace(1)* %in, align 2 %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone store double %cvt, double addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 3e426e3e94b1..2c6b1cb18f7e 100644 --- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -12,7 +12,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone ; EG: MEM_RAT MSKOR ; EG: VTX_READ_32 ; EG: FLT32_TO_FLT16 -define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* 
noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone store i16 %cvt, i16 addrspace(1)* %out, align 2 diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll index 1537d67cadcc..a7cddd09b762 100644 --- a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll +++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -6,7 +6,7 @@ declare double @llvm.fabs.f64(double) #1 ; FUNC-LABEL: @fp_to_sint_f64_i32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { %result = fptosi double %in to i32 store i32 %result, i32 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { ; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { %result = fptosi <2 x double> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -26,7 +26,7 @@ define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> % ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { %result = fptosi <4 x double> %in to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out ret void @@ -47,7 +47,7 @@ define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> % ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] ; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %val = load double, double addrspace(1)* %gep, align 8 @@ -58,7 +58,7 @@ define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in ; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}} -define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %conv = fptosi double %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { ; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}| -define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %in.fabs = call double @llvm.fabs.f64(double %in) %conv = fptosi double %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll index a2fa7a190745..630a7186e101 100644 --- a/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,7 +8,7 @@ declare float 
@llvm.fabs.f32(float) #1 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} ; SI: v_cvt_i32_f32_e32 ; SI: s_endpgm -define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { %conv = fptosi float %in to i32 store i32 %conv, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { ; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: ; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptosi float %in.fabs to i32 store i32 %conv, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { %result = fptosi <2 x float> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %value = load <4 x float>, <4 x float> addrspace(1) * %in %result = fptosi <4 x float> %value to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -76,7 +76,7 @@ define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspac ; Check that the compiler doesn't crash with a "cannot select" error ; SI: s_endpgm -define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { entry: %0 = fptosi float %in to i64 store i64 %0, i64 addrspace(1)* %out @@ -128,7 +128,7 @@ entry: ; EG-DAG: CNDE_INT ; SI: s_endpgm -define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { %conv = fptosi <2 x float> %x to <2 x i64> store <2 x i64> %conv, <2 x i64> addrspace(1)* %out ret void @@ -221,7 +221,7 @@ define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { ; EG-DAG: CNDE_INT ; SI: s_endpgm -define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { %conv = fptosi <4 x float> %x to <4 x i64> store <4 x i64> %conv, <4 x i64> addrspace(1)* %out ret void @@ -233,7 +233,7 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { ; EG: AND_INT ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y, ; EG-NEXT: -1082130432(-1.000000e+00) -define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %conv = fptosi float %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -241,7 +241,7 @@ define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; FUNC-LABEL: 
{{^}}fp_to_uint_fabs_f32_to_i1: ; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}| -define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptosi float %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out @@ -251,7 +251,7 @@ define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; FUNC-LABEL: {{^}}fp_to_sint_f32_i16: ; GCN: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_short [[VAL]] -define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 { %sint = fptosi float %in to i16 store i16 %sint, i16 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll index d5bc416434df..4f597eb3f32c 100644 --- a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll +++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -6,7 +6,7 @@ declare double @llvm.fabs.f64(double) #1 ; SI-LABEL: {{^}}fp_to_uint_i32_f64: ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { %cast = fptoui double %in to i32 store i32 %cast, i32 addrspace(1)* %out, align 4 ret void @@ -15,7 +15,7 @@ define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { ; SI-LABEL: @fp_to_uint_v2i32_v2f64 ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { %cast = fptoui <2 x double> %in to <2 x i32> store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -26,7 +26,7 @@ define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> % ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { %cast = fptoui <4 x double> %in to <4 x i32> store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 ret void @@ -47,7 +47,7 @@ define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> % ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] ; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %val = load double, double addrspace(1)* %gep, align 8 @@ -57,14 +57,14 @@ define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in } ; SI-LABEL: @fp_to_uint_v2i64_v2f64 -define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { %cast = fptoui <2 x double> %in to <2 x i64> store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 ret void } ; SI-LABEL: @fp_to_uint_v4i64_v4f64 -define void @fp_to_uint_v4i64_v4f64(<4 x i64> 
addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { %cast = fptoui <4 x double> %in to <4 x i64> store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 ret void @@ -72,7 +72,7 @@ define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> % ; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}} -define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %conv = fptoui double %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -80,7 +80,7 @@ define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}| -define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %in.fabs = call double @llvm.fabs.f64(double %in) %conv = fptoui double %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll index cbff9f22b073..fdb15801dc4e 100644 --- a/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) #1 ; GCN: v_cvt_u32_f32_e32 ; GCN: s_endpgm -define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { %conv = fptoui float %in to i32 store i32 %conv, i32 addrspace(1)* %out ret void @@ -21,7 +21,7 @@ define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { ; GCN: v_cvt_u32_f32_e32 ; GCN: v_cvt_u32_f32_e32 -define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { %result = fptoui <2 x float> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -37,7 +37,7 @@ define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> ; GCN: v_cvt_u32_f32_e32 ; GCN: v_cvt_u32_f32_e32 -define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %value = load <4 x float>, <4 x float> addrspace(1) * %in %result = fptoui <4 x float> %value to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -68,7 +68,7 @@ define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> ; EG-DAG: CNDE_INT ; GCN: s_endpgm -define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { +define amdgpu_kernel void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { %conv = fptoui float %x to i64 store i64 %conv, i64 addrspace(1)* %out ret void @@ -119,7 +119,7 @@ define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { ; EG-DAG: CNDE_INT ; GCN: s_endpgm -define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { %conv = fptoui <2 x float> %x to <2 x i64> store <2 x i64> %conv, <2 x i64> addrspace(1)* %out ret void @@ -212,7 +212,7 @@ 
define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> ; EG-DAG: CNDE_INT ; GCN: s_endpgm -define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { %conv = fptoui <4 x float> %x to <4 x i64> store <4 x i64> %conv, <4 x i64> addrspace(1)* %out ret void @@ -224,7 +224,7 @@ define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> ; EG: AND_INT ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0, -define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %conv = fptoui float %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -232,7 +232,7 @@ define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: ; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}| -define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptoui float %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out @@ -246,7 +246,7 @@ define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_short [[VAL]] -define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 { %uint = fptoui float %in to i16 store i16 %uint, i16 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll index c4f5d7cdfb5d..03657176c383 100644 --- a/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,14 +1,15 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s ; GCN-LABEL: {{^}}fpext_f16_to_f32 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_dword v[[R_F32]] ; GCN: s_endpgm -define void @fpext_f16_to_f32( +define amdgpu_kernel void @fpext_f16_to_f32( float addrspace(1)* %r, - half addrspace(1)* %a) { + half addrspace(1)* %a) #0 { entry: %a.val = load half, half addrspace(1)* %a %r.val = fpext half %a.val to float @@ -22,9 +23,9 @@ entry: ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}} ; GCN: s_endpgm -define void @fpext_f16_to_f64( +define amdgpu_kernel void 
@fpext_f16_to_f64( double addrspace(1)* %r, - half addrspace(1)* %a) { + half addrspace(1)* %a) #0 { entry: %a.val = load half, half addrspace(1)* %a %r.val = fpext half %a.val to double @@ -34,15 +35,17 @@ entry: ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] +; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] +; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] +; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} ; GCN: s_endpgm -define void @fpext_v2f16_to_v2f32( + +define amdgpu_kernel void @fpext_v2f16_to_v2f32( <2 x float> addrspace(1)* %r, - <2 x half> addrspace(1)* %a) { + <2 x half> addrspace(1)* %a) #0 { entry: %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = fpext <2 x half> %a.val to <2 x float> @@ -51,15 +54,18 @@ entry: } ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64 -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_cvt_f64_f32_e32 v{{\[}}{{[0-9]+}}:[[R_F64_3:[0-9]+]]{{\]}}, v[[A_F32_1]] -; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:{{[0-9]+}}{{\]}}, v[[A_F32_0]] -; GCN: buffer_store_dwordx4 v{{\[}}[[R_F64_0]]:[[R_F64_3]]{{\]}} +; GCN: buffer_load_dword +; SIGFX9-DAG: v_lshrrev_b32_e32 +; SIGFX9-DAG: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa +; GCN: v_cvt_f32_f16_e32 + +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @fpext_v2f16_to_v2f64( + +define amdgpu_kernel void @fpext_v2f16_to_v2f64( <2 x double> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -68,3 +74,202 @@ entry: store <2 x double> %r.val, <2 x double> addrspace(1)* %r ret void } + +; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32: +; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} +define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) { +entry: + %a.trunc = trunc i32 %a to i16 + %a.val = bitcast i16 %a.trunc to half + %r.val = fpext half %a.val to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]] +define amdgpu_kernel void @fneg_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.neg = fsub half -0.0, %a.val + %r.val = fpext half %a.neg to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]| +define amdgpu_kernel void @fabs_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %r.val = fpext half %a.fabs to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32: +; GCN: 
{{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]| +define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %a.fneg.fabs = fsub half -0.0, %a.fabs + %r.val = fpext half %a.fneg.fabs to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fneg_multi_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN-DAG: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[A]] + +; FIXME: Using the source modifier here only wastes code size +; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] + +; GCN: store_dword [[CVT]] +; GCN: store_short [[XOR]] +define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.neg = fsub half -0.0, %a.val + %r.val = fpext half %a.neg to float + store volatile float %r.val, float addrspace(1)* %r + store volatile half %a.neg, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fneg_multi_foldable_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] +; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]] +; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] + +; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]] +; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]] + +; GCN: buffer_store_dword [[CVTA_NEG]] +; GCN: buffer_store_short [[MUL]] +define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.neg = fsub half -0.0, %a.val + %r.val = fpext half %a.neg to float + %mul = fmul half %a.neg, %a.val + store volatile float %r.val, float addrspace(1)* %r + store volatile half %mul, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fabs_multi_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN-DAG: v_and_b32_e32 [[XOR:v[0-9]+]], 0x7fff, [[A]] + +; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]| + +; GCN: store_dword [[CVT]] +; GCN: store_short [[XOR]] +define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %r.val = fpext half %a.fabs to float + store volatile float %r.val, float addrspace(1)* %r + store volatile half %a.fabs, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fabs_multi_foldable_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] +; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], |[[CVTA]]|, [[CVTA]] +; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] +; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]] + +; GFX89-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]| +; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]] + +; GCN: buffer_store_dword [[ABS_A]] +; GCN: buffer_store_short [[MUL]] +define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + 
%a.fabs = call half @llvm.fabs.f16(half %a.val) + %r.val = fpext half %a.fabs to float + %mul = fmul half %a.fabs, %a.val + store volatile float %r.val, float addrspace(1)* %r + store volatile half %mul, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fabs_fneg_multi_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x8000, [[A]] + +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[OR]] +; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[OR]]| + +; GCN: buffer_store_dword [[CVT]] +; GCN: buffer_store_short [[OR]] +define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %a.fneg.fabs = fsub half -0.0, %a.fabs + %r.val = fpext half %a.fneg.fabs to float + store volatile float %r.val, float addrspace(1)* %r + store volatile half %a.fneg.fabs, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] +; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], -|[[CVTA]]|, [[CVTA]] +; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] +; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]] + +; GFX89-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]| +; GFX89-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]] + +; GCN: buffer_store_dword [[FABS_FNEG]] +; GCN: buffer_store_short [[MUL]] +define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %a.fneg.fabs = fsub half -0.0, %a.fabs + %r.val = fpext half %a.fneg.fabs to float + %mul = fmul half %a.fneg.fabs, %a.val + store volatile float %r.val, float addrspace(1)* %r + store volatile half %mul, half addrspace(1)* undef + ret void +} + +declare half @llvm.fabs.f16(half) #1 + +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll index 6dc84b01d734..b11e2ea056c3 100644 --- a/test/CodeGen/AMDGPU/fpext.ll +++ b/test/CodeGen/AMDGPU/fpext.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}fpext_f32_to_f64: ; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { %result = fpext float %in to double store double %result, double addrspace(1)* %out ret void @@ -12,7 +12,7 @@ define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { ; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { %result = fpext <2 x float> %in to <2 x double> store <2 x double> %result, <2 x double> addrspace(1)* %out ret void @@ -22,7 +22,7 @@ define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> % ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) { +define amdgpu_kernel void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) { %result = fpext <3 x float> %in to <3 
x double> store <3 x double> %result, <3 x double> addrspace(1)* %out ret void @@ -33,7 +33,7 @@ define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> % ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { %result = fpext <4 x float> %in to <4 x double> store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> % ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { +define amdgpu_kernel void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { %result = fpext <8 x float> %in to <8 x double> store <8 x double> %result, <8 x double> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll index 71f56d730e96..50e56e08416a 100644 --- a/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -7,7 +7,7 @@ ; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_I16]] ; GCN: s_endpgm -define void @fptosi_f16_to_i16( +define amdgpu_kernel void @fptosi_f16_to_i16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -23,7 +23,7 @@ entry: ; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fptosi_f16_to_i32( +define amdgpu_kernel void @fptosi_f16_to_i32( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -40,7 +40,7 @@ entry: ; GCN: buffer_load_ushort ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptosi_f16_to_i64( +define amdgpu_kernel void @fptosi_f16_to_i64( i64 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -52,17 +52,26 @@ entry: ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] -; GCN: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] -; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] -; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] + +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] +; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]] +; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] +; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] + +; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i16( + +define amdgpu_kernel void @fptosi_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -75,12 +84,13 @@ entry: ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32 ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: v_cvt_i32_f32_e32 ; GCN: v_cvt_i32_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i32( +define amdgpu_kernel void @fptosi_v2f16_to_v2i32( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -96,9 +106,10 @@ entry: ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64 ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i64( +define amdgpu_kernel void @fptosi_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll index a6876624a0c6..2afa6111cf17 100644 --- a/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -8,7 +8,7 @@ ; VI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_I16]] ; GCN: s_endpgm -define void @fptoui_f16_to_i16( +define amdgpu_kernel void @fptoui_f16_to_i16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -24,7 +24,7 @@ entry: ; GCN: v_cvt_u32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fptoui_f16_to_i32( +define amdgpu_kernel void @fptoui_f16_to_i32( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -41,7 +41,7 @@ entry: ; GCN: buffer_load_ushort ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptoui_f16_to_i64( +define amdgpu_kernel void @fptoui_f16_to_i64( i64 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -53,18 +53,25 @@ entry: ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] + +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] -; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] +; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]] + +; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] -; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] -; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]] +; VI: v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm -define void 
@fptoui_v2f16_to_v2i16( + +define amdgpu_kernel void @fptoui_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -77,12 +84,13 @@ entry: ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i32 ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: v_cvt_u32_f32_e32 ; GCN: v_cvt_u32_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @fptoui_v2f16_to_v2i32( +define amdgpu_kernel void @fptoui_v2f16_to_v2i32( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -98,9 +106,10 @@ entry: ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64 ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: s_endpgm -define void @fptoui_v2f16_to_v2i64( +define amdgpu_kernel void @fptoui_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll index 284fc53c8240..bc72f4424c98 100644 --- a/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,12 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; GCN-LABEL: {{^}}fptrunc_f32_to_f16 +; GCN-LABEL: {{^}}fptrunc_f32_to_f16: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fptrunc_f32_to_f16( +define amdgpu_kernel void @fptrunc_f32_to_f16( half addrspace(1)* %r, float addrspace(1)* %a) { entry: @@ -16,13 +17,13 @@ entry: ret void } -; GCN-LABEL: {{^}}fptrunc_f64_to_f16 +; GCN-LABEL: {{^}}fptrunc_f64_to_f16: ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}} ; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}} ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fptrunc_f64_to_f16( +define amdgpu_kernel void @fptrunc_f64_to_f16( half addrspace(1)* %r, double addrspace(1)* %a) { entry: @@ -32,16 +33,24 @@ entry: ret void } -; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16 +; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16: ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; GCN-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 
v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + +; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] +; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fptrunc_v2f32_to_v2f16( + +define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x float> addrspace(1)* %a) { entry: @@ -51,17 +60,23 @@ entry: ret void } -; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16 +; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16: ; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}} -; GCN: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}} -; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} -; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}} +; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} +; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] + +; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD + +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] +; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] + ; GCN: buffer_store_dword v[[R_V2_F16]] -define void @fptrunc_v2f64_to_v2f16( + +define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( <2 x half> addrspace(1)* %r, <2 x double> addrspace(1)* %a) { entry: @@ -70,3 +85,109 @@ entry: store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +; GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( + half addrspace(1)* %r, + float addrspace(1)* %a) { +entry: + %a.val = load float, float addrspace(1)* %a + %a.fneg = fsub float -0.0, %a.val + %r.val = fptrunc float %a.fneg to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( + half addrspace(1)* %r, + float addrspace(1)* %a) { +entry: + %a.val = load float, float addrspace(1)* %a + %a.fabs = call float @llvm.fabs.f32(float %a.val) + %r.val = fptrunc float %a.fabs to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]| +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define amdgpu_kernel void 
@fneg_fabs_fptrunc_f32_to_f16( + half addrspace(1)* %r, + float addrspace(1)* %a) #0 { +entry: + %a.val = load float, float addrspace(1)* %a + %a.fabs = call float @llvm.fabs.f32(float %a.val) + %a.fneg.fabs = fsub float -0.0, %a.fabs + %r.val = fptrunc float %a.fneg.fabs to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; GCN-NOT: v[[R_F16]] +; GCN: buffer_store_dword v[[R_F16]] +define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( + i32 addrspace(1)* %r, + float addrspace(1)* %a) #0 { +entry: + %a.val = load float, float addrspace(1)* %a + %r.val = fptrunc float %a.val to half + %r.i16 = bitcast half %r.val to i16 + %zext = zext i16 %r.i16 to i32 + store i32 %zext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| +; GCN-NOT: v[[R_F16]] +; GCN: buffer_store_dword v[[R_F16]] +define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( + i32 addrspace(1)* %r, + float addrspace(1)* %a) #0 { +entry: + %a.val = load float, float addrspace(1)* %a + %a.fabs = call float @llvm.fabs.f32(float %a.val) + %r.val = fptrunc float %a.fabs to half + %r.i16 = bitcast half %r.val to i16 + %zext = zext i16 %r.i16 to i32 + store i32 %zext, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fptrunc_f32_to_f16_sext_i32: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] +; GCN: v_bfe_i32 v[[R_F16_SEXT:[0-9]+]], v[[R_F16]], 0, 16 +; GCN: buffer_store_dword v[[R_F16_SEXT]] +define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( + i32 addrspace(1)* %r, + float addrspace(1)* %a) #0 { +entry: + %a.val = load float, float addrspace(1)* %a + %r.val = fptrunc float %a.val to half + %r.i16 = bitcast half %r.val to i16 + %zext = sext i16 %r.i16 to i32 + store i32 %zext, i32 addrspace(1)* %r + ret void +} + +declare float @llvm.fabs.f32(float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fptrunc.ll b/test/CodeGen/AMDGPU/fptrunc.ll index 0c7b67406a89..d9c5b7e6f359 100644 --- a/test/CodeGen/AMDGPU/fptrunc.ll +++ b/test/CodeGen/AMDGPU/fptrunc.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: ; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { %result = fptrunc double %in to float store float %result, float addrspace(1)* %out ret void @@ -14,7 +14,7 @@ define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { ; GCN-NOT: v_cvt ; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]] ; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]] -define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) { %result = fptrunc double %in to half %result_i16 = bitcast half %result to i16 store i16 %result_i16, i16 addrspace(1)* %out @@ -24,7 +24,7 @@ define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) { ; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void 
@fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { %result = fptrunc <2 x double> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -35,7 +35,7 @@ define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { %result = fptrunc <4 x double> %in to <4 x float> store <4 x float> %result, <4 x float> addrspace(1)* %out ret void @@ -50,7 +50,7 @@ define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { +define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { %result = fptrunc <8 x double> %in to <8 x float> store <8 x float> %result, <8 x float> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll index 0651dce8d95c..7a5bcfffa3f3 100644 --- a/test/CodeGen/AMDGPU/fract.f64.ll +++ b/test/CodeGen/AMDGPU/fract.f64.ll @@ -27,7 +27,7 @@ declare double @llvm.floor.f64(double) #0 ; GCN-UNSAFE: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]] ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %floor.x = call double @llvm.floor.f64(double %x) %fract = fsub double %x, %floor.x @@ -54,7 +54,7 @@ define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]] ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %neg.x = fsub double -0.0, %x %floor.neg.x = call double @llvm.floor.f64(double %neg.x) @@ -82,7 +82,7 @@ define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]| ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %abs.x = call double @llvm.fabs.f64(double %x) %neg.abs.x = fsub double -0.0, %abs.x @@ -98,7 +98,7 @@ define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* % ; VI-UNSAFE-DAG: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]] ; VI-UNSAFE: buffer_store_dwordx2 [[FLOOR]] ; VI-UNSAFE: buffer_store_dwordx2 [[FRACT]] -define void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %floor.x = call double @llvm.floor.f64(double %x) %fract = fsub double %x, %floor.x diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll 
index 4e1a503b1298..207fe280c9a6 100644 --- a/test/CodeGen/AMDGPU/fract.ll +++ b/test/CodeGen/AMDGPU/fract.ll @@ -14,7 +14,7 @@ declare float @llvm.floor.f32(float) #0 ; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %floor.x = call float @llvm.floor.f32(float %x) %fract = fsub float %x, %floor.x @@ -29,7 +29,7 @@ define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %x.neg = fsub float -0.0, %x %floor.x.neg = call float @llvm.floor.f32(float %x.neg) @@ -45,7 +45,7 @@ define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) # ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %abs.x = call float @llvm.fabs.f32(float %x) %neg.abs.x = fsub float -0.0, %abs.x @@ -61,7 +61,7 @@ define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %sr ; GCN-UNSAFE: buffer_store_dword [[FLOOR]] ; GCN-UNSAFE: buffer_store_dword [[FRACT]] -define void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %floor.x = call float @llvm.floor.f32(float %x) %fract = fsub float %x, %floor.x diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll index 039623c02194..9778069d0477 100644 --- a/test/CodeGen/AMDGPU/frem.ll +++ b/test/CodeGen/AMDGPU/frem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}frem_f32: ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} @@ -12,10 +12,10 @@ ; GCN: v_mul_f32_e32 ; GCN: v_div_fmas_f32 ; GCN: v_div_fixup_f32 -; GCN: v_trunc_f32_e32 -; GCN: v_mad_f32 +; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, 
+define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -33,8 +33,7 @@ define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] ; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] ; GCN: buffer_store_dword [[RESULT]] -; GCN: s_endpgm -define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #1 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -55,7 +54,7 @@ define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; GCN: v_add_f64 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -71,7 +70,7 @@ define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; CI: v_trunc_f64_e32 ; GCN: v_fma_f64 ; GCN: s_endpgm -define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #1 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -80,7 +79,7 @@ define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in ret void } -define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, <2 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 @@ -90,7 +89,7 @@ define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1) ret void } -define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, <4 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 @@ -100,7 +99,7 @@ define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ret void } -define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll index ed040436a61a..453d8fb37f2f 100644 --- a/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64: ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @v_safe_fsqrt_f64(double 
addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 { %r0 = load double, double addrspace(1)* %in %r1 = call double @llvm.sqrt.f64(double %r0) store double %r1, double addrspace(1)* %out @@ -12,7 +12,7 @@ define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %i ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f64: ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 { +define amdgpu_kernel void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 { %r0 = load double, double addrspace(1)* %in %r1 = call double @llvm.sqrt.f64(double %r0) store double %r1, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll index b6526b8e0787..a0fd3411ca05 100644 --- a/test/CodeGen/AMDGPU/fsqrt.ll +++ b/test/CodeGen/AMDGPU/fsqrt.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}v_safe_fsqrt_f32: ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} -define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { %r0 = load float, float addrspace(1)* %in %r1 = call float @llvm.sqrt.f32(float %r0) store float %r1, float addrspace(1)* %out @@ -16,7 +16,7 @@ define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32: ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} -define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 { +define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 { %r0 = load float, float addrspace(1)* %in %r1 = call float @llvm.sqrt.f32(float %r0) store float %r1, float addrspace(1)* %out @@ -29,7 +29,7 @@ define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %i ; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z ; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS -define void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 { entry: %fdiv = call float @llvm.sqrt.f32(float %in) store float %fdiv, float addrspace(1)* %out @@ -44,7 +44,7 @@ entry: ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS -define void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -65,7 +65,7 @@ entry: ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS -define void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { entry: %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) store <4 x float> %fdiv, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ entry: ; FUNC-LABEL: {{^}}elim_redun_check_neg0: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 { 
+define amdgpu_kernel void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp olt float %in, -0.000000e+00 @@ -87,7 +87,7 @@ entry: ; FUNC-LABEL: {{^}}elim_redun_check_pos0: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp olt float %in, 0.000000e+00 @@ -99,7 +99,7 @@ entry: ; FUNC-LABEL: {{^}}elim_redun_check_ult: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp ult float %in, -0.000000e+00 @@ -112,7 +112,7 @@ entry: ; GCN: v_sqrt_f32_e32 ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) %cmp = fcmp olt <2 x float> %in, @@ -125,7 +125,7 @@ entry: ; GCN: v_sqrt_f32_e32 ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) %cmp = fcmp ult <2 x float> %in, diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll index 0b3c8ac2503d..d3c5df317771 100644 --- a/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/test/CodeGen/AMDGPU/fsub.f16.ll @@ -1,17 +1,18 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s -; GCN-LABEL: {{^}}fsub_f16 +; GCN-LABEL: {{^}}fsub_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16( +define amdgpu_kernel void @fsub_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -23,16 +24,15 @@ entry: ret void } -; GCN-LABEL: {{^}}fsub_f16_imm_a +; GCN-LABEL: {{^}}fsub_f16_imm_a: ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 
v[[B_F16]] -; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] +; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16_imm_a( +define amdgpu_kernel void @fsub_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -42,16 +42,15 @@ entry: ret void } -; GCN-LABEL: {{^}}fsub_f16_imm_b +; GCN-LABEL: {{^}}fsub_f16_imm_b: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0xc000{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], -2.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]] +; GFX89: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16_imm_b( +define amdgpu_kernel void @fsub_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -61,27 +60,33 @@ entry: ret void } -; GCN-LABEL: {{^}}fsub_v2f16 +; GCN-LABEL: {{^}}fsub_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_subrev_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16( + +define amdgpu_kernel void @fsub_v2f16( <2 
x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -93,25 +98,32 @@ entry: ret void } -; GCN-LABEL: {{^}}fsub_v2f16_imm_a +; GCN-LABEL: {{^}}fsub_v2f16_imm_a: ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}} + ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] +; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00 +; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16_imm_a( + +define amdgpu_kernel void @fsub_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -121,25 +133,32 @@ entry: ret void } -; GCN-LABEL: {{^}}fsub_v2f16_imm_b +; GCN-LABEL: {{^}}fsub_v2f16_imm_b: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}} -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] -; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: 
v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]] +; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000 +; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}} + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16_imm_b( + +define amdgpu_kernel void @fsub_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll index 3429df33c015..e7a92d95d485 100644 --- a/test/CodeGen/AMDGPU/fsub.ll +++ b/test/CodeGen/AMDGPU/fsub.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}v_fsub_f32: ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -17,23 +17,19 @@ define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { %sub = fsub float %a, %b store float %sub, float addrspace(1)* %out, align 4 ret void } -declare float @llvm.r600.load.input(i32) readnone - -declare void @llvm.AMDGPU.store.output(float, i32) - ; FUNC-LABEL: {{^}}fsub_v2f32: ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { %sub = fsub <2 x float> %a, %b store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 ret void @@ -49,7 +45,7 @@ define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 @@ -64,8 +60,75 @@ define void @v_fsub_v4f32(<4 x float> 
addrspace(1)* %out, <4 x float> addrspace( ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: s_endpgm -define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { +define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { %result = fsub <4 x float> %a, %b store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 ret void } + +; FUNC-LABEL: {{^}}v_fneg_fsub_f32: +; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] +define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %a = load float, float addrspace(1)* %in, align 4 + %b = load float, float addrspace(1)* %b_ptr, align 4 + %result = fsub float %a, %b + %neg.result = fsub float -0.0, %result + store float %neg.result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32: +; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI-NOT: xor +define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %a = load float, float addrspace(1)* %in, align 4 + %b = load float, float addrspace(1)* %b_ptr, align 4 + %result = fsub nsz float %a, %b + %neg.result = fsub float -0.0, %result + store float %neg.result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32: +; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI-NOT: xor +define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %a = load float, float addrspace(1)* %in, align 4 + %b = load float, float addrspace(1)* %b_ptr, align 4 + %result = fsub float %a, %b + %neg.result = fsub float -0.0, %result + store float %neg.result, float addrspace(1)* %out, align 4 + ret void +} + +; For some reason the attribute has a string "true" or "false", so +; make sure it is disabled and the fneg is not folded if it is not +; "true". 
+; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32: +; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] +define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %a = load float, float addrspace(1)* %in, align 4 + %b = load float, float addrspace(1)* %b_ptr, align 4 + %result = fsub float %a, %b + %neg.result = fsub float -0.0, %result + store float %neg.result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_fsub_0_nsz_attribute_f32: +; SI-NOT: v_sub +define amdgpu_kernel void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %a = load float, float addrspace(1)* %in, align 4 + %result = fsub float %a, 0.0 + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" } +attributes #1 = { nounwind "no-signed-zeros-fp-math"="false" } diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll index 4c9c5ddd4c6e..1b0879d098ee 100644 --- a/test/CodeGen/AMDGPU/fsub64.ll +++ b/test/CodeGen/AMDGPU/fsub64.ll @@ -5,7 +5,7 @@ declare double @llvm.fabs.f64(double) #0 ; SI-LABEL: {{^}}fsub_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -16,7 +16,7 @@ define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI-LABEL: {{^}}fsub_fabs_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} -define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -28,7 +28,7 @@ define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI-LABEL: {{^}}fsub_fabs_inv_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}} -define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -40,7 +40,7 @@ define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* % ; SI-LABEL: {{^}}s_fsub_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double %a, %b store double %sub, double addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { ; SI-LABEL: {{^}}s_fsub_imm_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0 -define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double 
%b) { %sub = fsub double 4.0, %a store double %sub, double addrspace(1)* %out ret void @@ -56,7 +56,7 @@ define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { ; SI-LABEL: {{^}}s_fsub_imm_inv_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0 -define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double %a, 4.0 store double %sub, double addrspace(1)* %out ret void @@ -64,7 +64,7 @@ define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) ; SI-LABEL: {{^}}s_fsub_self_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { +define amdgpu_kernel void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { %sub = fsub double %a, %a store double %sub, double addrspace(1)* %out ret void @@ -73,7 +73,7 @@ define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { ; SI-LABEL: {{^}}fsub_v2f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { +define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { %sub = fsub <2 x double> %a, %b store <2 x double> %sub, <2 x double> addrspace(1)* %out ret void @@ -84,7 +84,7 @@ define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x d ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { +define amdgpu_kernel void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 %a = load <4 x double>, <4 x double> addrspace(1)* %in %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr @@ -98,7 +98,7 @@ define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace( ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { +define amdgpu_kernel void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { %result = fsub <4 x double> %a, %b store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16 ret void diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll index c4138ad79c28..1f72ec65588e 100644 --- a/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -13,7 +13,7 @@ declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone ; CI: v_trunc_f64 ; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 ; SI: s_endpgm -define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %x = load double, double addrspace(1)* %in, 
align 8 %y = call double @llvm.trunc.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out, align 8 @@ -36,7 +36,7 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { ; SI-DAG: cndmask_b32 ; SI-DAG: cndmask_b32 ; SI: s_endpgm -define void @ftrunc_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.trunc.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -45,7 +45,7 @@ define void @ftrunc_f64(double addrspace(1)* %out, double %x) { ; FUNC-LABEL: {{^}}ftrunc_v2f64: ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -55,7 +55,7 @@ define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; FIXME-CI: v_trunc_f64_e32 ; FIXME-CI: v_trunc_f64_e32 ; FIXME-CI: v_trunc_f64_e32 -; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; define amdgpu_kernel void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { ; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone ; store <3 x double> %y, <3 x double> addrspace(1)* %out ; ret void @@ -66,7 +66,7 @@ define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone store <4 x double> %y, <4 x double> addrspace(1)* %out ret void @@ -81,7 +81,7 @@ define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { +define amdgpu_kernel void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -104,7 +104,7 @@ define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll index d0718394e7f1..b5ad01eaeaf0 100644 --- a/test/CodeGen/AMDGPU/ftrunc.ll +++ b/test/CodeGen/AMDGPU/ftrunc.ll @@ -12,7 +12,7 @@ declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone ; FUNC-LABEL: {{^}}ftrunc_f32: ; EG: TRUNC ; SI: v_trunc_f32_e32 -define void @ftrunc_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @ftrunc_f32(float addrspace(1)* %out, float %x) { %y = call float @llvm.trunc.f32(float %x) nounwind readnone store float %y, float addrspace(1)* %out ret void @@ -23,7 +23,7 
@@ define void @ftrunc_f32(float addrspace(1)* %out, float %x) { ; EG: TRUNC ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone store <2 x float> %y, <2 x float> addrspace(1)* %out ret void @@ -36,7 +36,7 @@ define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { ; FIXME-SI: v_trunc_f32_e32 ; FIXME-SI: v_trunc_f32_e32 ; FIXME-SI: v_trunc_f32_e32 -; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +; define amdgpu_kernel void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { ; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone ; store <3 x float> %y, <3 x float> addrspace(1)* %out ; ret void @@ -51,7 +51,7 @@ define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone store <4 x float> %y, <4 x float> addrspace(1)* %out ret void @@ -74,7 +74,7 @@ define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { +define amdgpu_kernel void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone store <8 x float> %y, <8 x float> addrspace(1)* %out ret void @@ -113,7 +113,7 @@ define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { +define amdgpu_kernel void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone store <16 x float> %y, <16 x float> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll index f96463613e8e..7fb47e08ea58 100644 --- a/test/CodeGen/AMDGPU/gep-address-space.ll +++ b/test/CodeGen/AMDGPU/gep-address-space.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s -define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { +define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { ; CHECK-LABEL: {{^}}use_gep_address_space: ; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} ; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 @@ -17,7 +17,7 @@ define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { ; SI: s_or_b32 ; CI: s_add_i32 ; CHECK: ds_write_b32 -define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { +define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, 
i16 0, i16 16384 store i32 99, i32 addrspace(3)* %p ret void @@ -39,7 +39,7 @@ define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %arra ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CHECK: s_endpgm -define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { +define amdgpu_kernel void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 @@ -60,7 +60,7 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CHECK: s_endpgm -define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { +define amdgpu_kernel void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll index 5a18d425d506..80acfcca7082 100644 --- a/test/CodeGen/AMDGPU/global-constant.ll +++ b/test/CodeGen/AMDGPU/global-constant.ll @@ -26,7 +26,7 @@ ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4 -define void @private_test(i32 %index, float addrspace(1)* %out) { +define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) { %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index %val = load float, float addrspace(2)* %ptr store float %val, float addrspace(1)* %out @@ -40,7 +40,7 @@ define void @private_test(i32 %index, float addrspace(1)* %out) { ; HSA: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4 -define void @available_externally_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1 %val = load i32, i32 addrspace(2)* %ptr store i32 %val, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/global-directive.ll b/test/CodeGen/AMDGPU/global-directive.ll index 450b7d367429..ce89e390eac1 100644 --- a/test/CodeGen/AMDGPU/global-directive.ll +++ b/test/CodeGen/AMDGPU/global-directive.ll @@ -5,7 +5,7 @@ ; SI: .globl foo ; SI: {{^}}foo: -define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll index 2c7c02de1673..19e592f50bea 100644 --- a/test/CodeGen/AMDGPU/global-extload-i16.ll +++ b/test/CodeGen/AMDGPU/global-extload-i16.ll @@ -7,7 +7,7 @@ ; SI: buffer_load_ushort ; SI: buffer_store_dword ; SI: s_endpgm -define void 
@zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; SI: buffer_load_sshort ; SI: buffer_store_dword ; SI: s_endpgm -define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: ; SI: buffer_load_ushort ; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -38,7 +38,7 @@ define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: ; SI: buffer_load_sshort ; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -47,7 +47,7 @@ define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: ; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -56,7 +56,7 @@ define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: ; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -65,7 +65,7 @@ define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: ; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { 
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -74,7 +74,7 @@ define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: ; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -83,7 +83,7 @@ define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: ; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -92,7 +92,7 @@ define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: ; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -101,7 +101,7 @@ define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: ; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -110,7 +110,7 @@ define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: ; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -119,7 +119,7 @@ define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: ; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext 
<32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -128,7 +128,7 @@ define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: ; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: ; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -146,7 +146,7 @@ define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: ; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -157,7 +157,7 @@ define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]], ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -168,7 +168,7 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 ; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] ; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 -define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -177,7 +177,7 @@ define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: ; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x 
i64> %ext, <1 x i64> addrspace(1)* %out @@ -186,7 +186,7 @@ define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: ; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -195,7 +195,7 @@ define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: ; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -204,7 +204,7 @@ define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: ; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -213,7 +213,7 @@ define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: ; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -222,7 +222,7 @@ define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: ; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -231,7 +231,7 @@ define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: ; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -240,7 +240,7 @@ define void 
@zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: ; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: ; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -258,7 +258,7 @@ define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: ; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -267,7 +267,7 @@ define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: ; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -276,7 +276,7 @@ define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: ; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -285,7 +285,7 @@ define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: ; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -294,7 +294,7 @@ define void 
@zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: ; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/global-variable-relocs.ll b/test/CodeGen/AMDGPU/global-variable-relocs.ll index 00be6e4d5c15..ae6dd54fec6c 100644 --- a/test/CodeGen/AMDGPU/global-variable-relocs.ll +++ b/test/CodeGen/AMDGPU/global-variable-relocs.ll @@ -19,7 +19,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @private_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -33,7 +33,7 @@ define void @private_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @internal_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -50,7 +50,7 @@ define void @internal_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @available_externally_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -67,7 +67,7 @@ define void @available_externally_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @linkonce_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -84,7 +84,7 @@ define void @linkonce_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @weak_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -101,7 +101,7 @@ define void @weak_test(i32 addrspace(1)* %out) 
{ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @common_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -118,7 +118,7 @@ define void @common_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @extern_weak_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -135,7 +135,7 @@ define void @extern_weak_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @linkonce_odr_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -152,7 +152,7 @@ define void @linkonce_odr_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @weak_odr_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -169,7 +169,7 @@ define void @weak_odr_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @external_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -186,7 +186,7 @@ define void @external_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @external_w_init_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll index 909ceb5546c6..6928bede547e 100644 --- a/test/CodeGen/AMDGPU/global_atomics.ll +++ b/test/CodeGen/AMDGPU/global_atomics.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_offset: ; GCN: buffer_atomic_add v{{[0-9]+}}, off, 
s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -13,7 +13,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_soffset: ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}} -define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -25,7 +25,7 @@ entry: ; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd ; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add -define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 @@ -36,7 +36,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -47,7 +47,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -59,7 +59,7 @@ entry: ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -70,7 +70,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32: ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -79,7 +79,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_ret: ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword 
[[RET]] -define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -89,7 +89,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_addr64: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -100,7 +100,7 @@ entry: ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -110,7 +110,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_offset: ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst @@ -120,7 +120,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst @@ -131,7 +131,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -143,7 +143,7 @@ entry: ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 
%in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -154,7 +154,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32: ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -163,7 +163,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_ret: ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -173,7 +173,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_addr64: ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -184,7 +184,7 @@ entry: ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -194,7 +194,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_offset: ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst @@ -204,7 +204,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst @@ -215,7 +215,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void 
@atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -227,7 +227,7 @@ entry: ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -238,7 +238,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32: ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -247,7 +247,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_ret: ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -257,7 +257,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -268,7 +268,7 @@ entry: ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -278,7 +278,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_offset: ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst @@ -288,7 +288,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void 
@atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst @@ -299,7 +299,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -311,7 +311,7 @@ entry: ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -322,7 +322,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32: ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -331,7 +331,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_ret: ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -341,7 +341,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_addr64: ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -352,7 +352,7 @@ entry: ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile max i32 
addrspace(1)* %ptr, i32 %in seq_cst @@ -362,7 +362,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_offset: ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst @@ -372,7 +372,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst @@ -383,7 +383,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -395,7 +395,7 @@ entry: ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -406,7 +406,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32: ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -415,7 +415,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_ret: ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -425,7 +425,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = 
getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -436,7 +436,7 @@ entry: ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -446,7 +446,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_offset: ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst @@ -456,7 +456,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst @@ -467,7 +467,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -479,7 +479,7 @@ entry: ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -490,7 +490,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32: ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -499,7 +499,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_ret: ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret(i32 
addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -509,7 +509,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_addr64: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -520,7 +520,7 @@ entry: ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -530,7 +530,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_offset: ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst @@ -540,7 +540,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: ; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst @@ -551,7 +551,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -563,7 +563,7 @@ entry: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { 
entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -574,7 +574,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32: ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -583,7 +583,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_ret: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -593,7 +593,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -604,7 +604,7 @@ entry: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -614,7 +614,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_offset: ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst @@ -624,7 +624,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst @@ -635,7 +635,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void 
@atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -647,7 +647,7 @@ entry: ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -658,7 +658,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32: ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -667,7 +667,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_ret: ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -677,7 +677,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_addr64: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -688,7 +688,7 @@ entry: ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -698,7 +698,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst @@ -708,7 +708,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_offset(i32 
addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst @@ -720,7 +720,7 @@ entry: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -733,7 +733,7 @@ entry: ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -744,7 +744,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32: ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -753,7 +753,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -763,7 +763,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -774,7 +774,7 @@ entry: ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -784,7 +784,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, 
s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -794,7 +794,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset: ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -807,7 +807,7 @@ entry: ; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -819,7 +819,7 @@ entry: ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -831,7 +831,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32: ; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst ret void @@ -840,7 +840,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret: ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst %extract0 = extractvalue { i32, i1 } %val, 0 @@ -851,7 +851,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void 
@atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -862,7 +862,7 @@ entry: ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -873,7 +873,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_offset: ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst @@ -883,7 +883,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst @@ -894,7 +894,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -906,7 +906,7 @@ entry: ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -917,7 +917,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32: ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw 
volatile xor i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -926,7 +926,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_ret: ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -936,7 +936,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -947,7 +947,7 @@ entry: ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -959,7 +959,7 @@ entry: ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4 %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4 @@ -971,7 +971,7 @@ entry: ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4 store i32 %val, i32 addrspace(1)* %out @@ -982,7 +982,7 @@ entry: ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -995,7 +995,7 @@ entry: ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], 
v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index %val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4 @@ -1006,7 +1006,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32_offset: ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4 @@ -1016,7 +1016,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32: ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 ret void @@ -1025,7 +1025,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -1036,7 +1036,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32_addr64: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4 diff --git a/test/CodeGen/AMDGPU/global_atomics_i64.ll b/test/CodeGen/AMDGPU/global_atomics_i64.ll index f66c6c7b531a..56520b787ead 100644 --- a/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_offset: ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst @@ -13,7 +13,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset: ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void 
@atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst @@ -24,7 +24,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -36,7 +36,7 @@ entry: ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -47,7 +47,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64: ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -56,7 +56,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret: ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -66,7 +66,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -77,7 +77,7 @@ entry: ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index 
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -87,7 +87,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_offset: ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst @@ -97,7 +97,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset: ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst @@ -108,7 +108,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -120,7 +120,7 @@ entry: ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -131,7 +131,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64: ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -140,7 +140,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret: ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -150,7 +150,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64(i64 
addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -161,7 +161,7 @@ entry: ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -171,7 +171,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_offset: ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst @@ -181,7 +181,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset: ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst @@ -192,7 +192,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -204,7 +204,7 @@ entry: ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -215,7 +215,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64: ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = 
atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -224,7 +224,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret: ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -234,7 +234,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -245,7 +245,7 @@ entry: ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -255,7 +255,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_offset: ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst @@ -265,7 +265,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset: ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst @@ -276,7 +276,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -288,7 +288,7 @@ entry: ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], 
v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -299,7 +299,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64: ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -308,7 +308,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret: ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -318,7 +318,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -329,7 +329,7 @@ entry: ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -339,7 +339,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_offset: ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst @@ -349,7 +349,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset: ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void 
@atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst @@ -360,7 +360,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -372,7 +372,7 @@ entry: ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -383,7 +383,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64: ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -392,7 +392,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret: ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -402,7 +402,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_addr64: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -413,7 +413,7 @@ entry: ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr 
i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -423,7 +423,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_offset: ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst @@ -433,7 +433,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset: ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst @@ -444,7 +444,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -456,7 +456,7 @@ entry: ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -467,7 +467,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64: ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -476,7 +476,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret: ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -486,7 +486,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], 
v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -497,7 +497,7 @@ entry: ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -507,7 +507,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_offset: ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst @@ -517,7 +517,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset: ; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst @@ -528,7 +528,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -540,7 +540,7 @@ entry: ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -551,7 +551,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64: ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) { 
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -560,7 +560,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_ret: ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -570,7 +570,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_addr64: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -581,7 +581,7 @@ entry: ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -591,7 +591,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_offset: ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst @@ -601,7 +601,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset: ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst @@ -612,7 +612,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -624,7 
+624,7 @@ entry: ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -635,7 +635,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64: ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -644,7 +644,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret: ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -654,7 +654,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -665,7 +665,7 @@ entry: ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -675,7 +675,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_offset: ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst @@ -685,7 +685,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset: ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* 
%out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst @@ -696,7 +696,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -708,7 +708,7 @@ entry: ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -719,7 +719,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64: ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -728,7 +728,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret: ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -738,7 +738,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -749,7 +749,7 @@ entry: ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 
%in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -759,7 +759,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_offset: ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst @@ -769,7 +769,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset: ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst @@ -780,7 +780,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -792,7 +792,7 @@ entry: ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -803,7 +803,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64: ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -812,7 +812,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret: ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -822,7 +822,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: 
flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -833,7 +833,7 @@ entry: ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -851,7 +851,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_offset: ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -861,7 +861,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_soffset: ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x11940 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}} -define void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -871,7 +871,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset: ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -884,7 +884,7 @@ entry: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -896,7 +896,7 @@ entry: ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_cmpswap_x2 
v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -908,7 +908,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64: ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst ret void @@ -917,7 +917,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret: ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 @@ -928,7 +928,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -939,7 +939,7 @@ entry: ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -952,7 +952,7 @@ entry: ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4 %val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8 @@ -964,7 +964,7 @@ entry: ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], 
off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8 store i64 %val, i64 addrspace(1)* %out @@ -975,7 +975,7 @@ entry: ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -988,7 +988,7 @@ entry: ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index %val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8 @@ -999,7 +999,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i64_offset: ; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -define void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8 @@ -1009,7 +1009,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc -define void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) { entry: store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8 ret void @@ -1018,7 +1018,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i64_addr64_offset: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -1029,7 +1029,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i64_addr64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] 
glc{{$}} -define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8 diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll index d07843e9dd27..0903542bac4f 100644 --- a/test/CodeGen/AMDGPU/gv-const-addrspace.ll +++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll @@ -15,7 +15,7 @@ ; EG: @float_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @float(float addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 @@ -33,7 +33,7 @@ entry: ; EG: @i32_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @i32(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) { entry: %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index %1 = load i32, i32 addrspace(2)* %0 @@ -53,7 +53,7 @@ entry: ; EG: @struct_foo_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index %load = load i32, i32 addrspace(2)* %gep, align 4 store i32 %load, i32 addrspace(1)* %out, align 4 @@ -72,7 +72,7 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { ; EG: @array_v1_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 @@ -84,7 +84,7 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { ; EG: VTX_READ_32 ; EG: @float_gv ; EG-NOT: MOVA_INT -define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { +define amdgpu_kernel void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { entry: %0 = icmp eq i32 0, %a br i1 %0, label %if, label %else diff --git a/test/CodeGen/AMDGPU/gv-offset-folding.ll b/test/CodeGen/AMDGPU/gv-offset-folding.ll index af5ee8e66750..e641d7266a79 100644 --- a/test/CodeGen/AMDGPU/gv-offset-folding.ll +++ b/test/CodeGen/AMDGPU/gv-offset-folding.ll @@ -12,8 +12,8 @@ ; for local memory globals. 
; CHECK-LABEL: lds_no_offset: -; CHECK ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4 -define void @lds_no_offset() { +; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4 +define amdgpu_kernel void @lds_no_offset() { entry: %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1 store i32 0, i32 addrspace(3)* %ptr diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll index aa22e83fade2..41ae5a4a0b00 100644 --- a/test/CodeGen/AMDGPU/half.ll +++ b/test/CodeGen/AMDGPU/half.ll @@ -8,7 +8,7 @@ ; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] ; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]] ; GCN: buffer_store_short [[CVT]] -define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { store half %arg, half addrspace(1)* %out ret void } @@ -20,7 +20,7 @@ define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]] ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm -define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { store <2 x half> %arg, <2 x half> addrspace(1)* %out ret void } @@ -34,7 +34,7 @@ define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { ; GCN-DAG: buffer_store_short ; GCN-NOT: buffer_store ; GCN: s_endpgm -define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { +define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { store <3 x half> %arg, <3 x half> addrspace(1)* %out ret void } @@ -46,33 +46,33 @@ define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { ; GCN: buffer_load_ushort ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}load_v8f16_arg: -define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { store <8 x half> %arg, <8 x half> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v2f16_arg: -define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { +define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { %fpext = fpext <2 x half> %in to <2 x float> store <2 x float> %fpext, <2 x float> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_f16_to_f32_arg: -define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to float store float %ext, float addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: -define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { %ext = fpext <2 x half> %arg to <2 x float> store <2 x float> %ext, <2 x float> addrspace(1)* %out ret void @@ -90,14 +90,14 @@ define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x hal ; GCN-DAG: 
buffer_store_dword ; GCN-DAG: buffer_store_dwordx2 ; GCN: s_endpgm -define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { +define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x float> store <3 x float> %ext, <3 x float> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: -define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x float> store <4 x float> %ext, <4 x float> addrspace(1)* %out ret void @@ -124,7 +124,7 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, <8 x float> addrspace(1)* %out ret void @@ -138,7 +138,7 @@ define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x hal ; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]] ; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to double store double %ext, double addrspace(1)* %out ret void @@ -152,7 +152,7 @@ define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { %ext = fpext <2 x half> %arg to <2 x double> store <2 x double> %ext, <2 x double> addrspace(1)* %out ret void @@ -169,7 +169,7 @@ define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x ha ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { +define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x double> store <3 x double> %ext, <3 x double> addrspace(1)* %out ret void @@ -189,7 +189,7 @@ define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x ha ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x double> store <4 x double> %ext, <4 x double> addrspace(1)* %out ret void @@ -227,7 +227,7 @@ define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x ha ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) 
#0 { %ext = fpext <8 x half> %arg to <8 x double> store <8 x double> %ext, <8 x double> addrspace(1)* %out ret void @@ -236,7 +236,7 @@ define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x ha ; GCN-LABEL: {{^}}global_load_store_f16: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in store half %val, half addrspace(1)* %out ret void @@ -245,7 +245,7 @@ define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* % ; GCN-LABEL: {{^}}global_load_store_v2f16: ; GCN: buffer_load_dword [[TMP:v[0-9]+]] ; GCN: buffer_store_dword [[TMP]] -define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in store <2 x half> %val, <2 x half> addrspace(1)* %out ret void @@ -254,7 +254,7 @@ define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> a ; GCN-LABEL: {{^}}global_load_store_v4f16: ; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx2 [[TMP]] -define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in store <4 x half> %val, <4 x half> addrspace(1)* %out ret void @@ -264,7 +264,7 @@ define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> ad ; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: s_endpgm -define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in store <8 x half> %val, <8 x half> addrspace(1)* %out ret void @@ -274,7 +274,7 @@ define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> a ; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] ; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] ; GCN: buffer_store_dword [[CVT]] -define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in %cvt = fpext half %val to float store float %cvt, float addrspace(1)* %out @@ -283,13 +283,13 @@ define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace( ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: ; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] ; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] -; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] -; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN: buffer_store_dwordx2 
v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm -define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out @@ -297,7 +297,7 @@ define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x } ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: -define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in %cvt = fpext <3 x half> %val to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out @@ -305,7 +305,7 @@ define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x } ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: -define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in %cvt = fpext <4 x half> %val to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out @@ -313,7 +313,7 @@ define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x } ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: -define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in %cvt = fpext <8 x half> %val to <8 x float> store <8 x float> %cvt, <8 x float> addrspace(1)* %out @@ -324,22 +324,26 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x ; GCN: buffer_load_dwordx4 ; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 + +; VI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa +; ... 
; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 @@ -347,7 +351,7 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x float> store <16 x float> %cvt, <16 x float> addrspace(1)* %out @@ -359,7 +363,7 @@ define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <1 ; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] ; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] ; GCN: buffer_store_dwordx2 [[CVT1]] -define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in %cvt = fpext half %val to double store double %cvt, double addrspace(1)* %out @@ -368,14 +372,21 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: ; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] + +; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; SI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] +; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] + +; VI-DAG: v_cvt_f32_f16_sdwa v[[CVT0:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD]] +; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT0]] +; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT1]] + ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} ; GCN: s_endpgm -define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x double> store <2 x double> %cvt, <2 x double> addrspace(1)* %out @@ -392,28 +403,27 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x ; XSI-NOT: v_cvt_f32_f16 ; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} -; XVI: v_cvt_f32_f16_e32 ; XVI: v_cvt_f32_f16_e32 ; XVI: v_cvt_f32_f16_e32 +; XVI: v_cvt_f32_f16_sdwa ; XVI-NOT: v_cvt_f32_f16 ; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] -; VI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] -; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] -; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] 
-; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] -; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] - -; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] -; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] -; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]] +; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] +; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] +; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] +; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] +; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 + +; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] +; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]] ; GCN-NOT: v_cvt_f64_f32_e32 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 ; GCN: s_endpgm -define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in %cvt = fpext <3 x half> %val to <3 x double> store <3 x double> %cvt, <3 x double> addrspace(1)* %out @@ -421,7 +431,7 @@ define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x } ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: -define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in %cvt = fpext <4 x half> %val to <4 x double> store <4 x double> %cvt, <4 x double> addrspace(1)* %out @@ -429,7 +439,7 @@ define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x } ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: -define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in %cvt = fpext <8 x half> %val to <8 x double> store <8 x double> %cvt, <8 x double> addrspace(1)* %out @@ -437,7 +447,7 @@ define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: -define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x double> store <16 x double> %cvt, <16 x double> addrspace(1)* %out @@ -448,7 +458,7 @@ define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, < ; GCN: buffer_load_dword [[LOAD:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] ; GCN: buffer_store_short [[CVT]] -define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* 
%in) #0 { %val = load float, float addrspace(1)* %in %cvt = fptrunc float %val to half store half %cvt, half addrspace(1)* %out @@ -458,12 +468,17 @@ define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspa ; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: ; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] -; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] + +; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] +; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] +; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] + +; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]] + ; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm -define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %val = load <2 x float>, <2 x float> addrspace(1)* %in %cvt = fptrunc <2 x float> %val to <2 x half> store <2 x half> %cvt, <2 x half> addrspace(1)* %out @@ -472,14 +487,14 @@ define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: ; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-NOT: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; SI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; GCN-DAG: v_cvt_f16_f32_e32 ; GCN: buffer_store_short ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %val = load <3 x float>, <3 x float> addrspace(1)* %in %cvt = fptrunc <3 x float> %val to <3 x half> store <3 x half> %cvt, <3 x half> addrspace(1)* %out @@ -488,13 +503,15 @@ define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 ; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: ; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; SI-DAG: v_cvt_f16_f32_e32 +; SI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa +; GCN-DAG: v_cvt_f16_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %val = load <4 x float>, <4 x float> addrspace(1)* %in %cvt = fptrunc <4 x float> %val to <4 x half> store <4 x half> %cvt, <4 x half> addrspace(1)* %out @@ -504,17 +521,25 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: ; GCN: buffer_load_dwordx4 ; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; 
GCN: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { %val = load <8 x float>, <8 x float> addrspace(1)* %in %cvt = fptrunc <8 x float> %val to <8 x half> store <8 x half> %cvt, <8 x half> addrspace(1)* %out @@ -545,7 +570,7 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 ; GCN-DAG: buffer_store_dwordx4 ; GCN-DAG: buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { %val = load <16 x float>, <16 x float> addrspace(1)* %in %cvt = fptrunc <16 x float> %val to <16 x half> store <16 x half> %cvt, <16 x half> addrspace(1)* %out @@ -560,7 +585,7 @@ define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { +define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { %add = fadd half %a, %b store half %add, half addrspace(1)* %out, align 4 ret void @@ -570,7 +595,7 @@ define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { +define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { %add = fadd <2 x half> %a, %b store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 ret void @@ -582,7 +607,7 @@ define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 @@ -601,7 +626,7 @@ define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { +define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { %add = fadd <8 x half> %a, %b store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 ret void @@ -610,7 +635,7 @@ define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> ; GCN-LABEL: {{^}}test_bitcast_from_half: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void 
@test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { %val = load half, half addrspace(1)* %in %val_int = bitcast half %val to i16 store i16 %val_int, i16 addrspace(1)* %out @@ -620,7 +645,7 @@ define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %o ; GCN-LABEL: {{^}}test_bitcast_to_half: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %val = load i16, i16 addrspace(1)* %in %val_fp = bitcast i16 %val to half store half %val_fp, half addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/hsa-default-device.ll b/test/CodeGen/AMDGPU/hsa-default-device.ll index 631d6def4442..45efe9b86557 100644 --- a/test/CodeGen/AMDGPU/hsa-default-device.ll +++ b/test/CodeGen/AMDGPU/hsa-default-device.ll @@ -4,7 +4,7 @@ ; unsupported device. ; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" -define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { +define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { store float 0.0, float addrspace(1)* %out0 ret void } diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll index 51d6aee25f45..b1901cf894b0 100644 --- a/test/CodeGen/AMDGPU/hsa-fp-mode.ll +++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll @@ -4,7 +4,7 @@ ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { +define amdgpu_kernel void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -14,7 +14,7 @@ define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %ou ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { +define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -24,7 +24,7 @@ define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %ou ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { +define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -34,7 +34,7 @@ define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* ; GCN: float_mode = 48 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { +define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -44,7 +44,7 @@ define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* ; GCN: float_mode = 240 ; GCN: 
enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { +define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -54,7 +54,17 @@ define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace( ; GCN: float_mode = 0 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { +define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} + +; GCN-LABEL: {{^}}test_no_dx10_clamp_vi: +; GCN: float_mode = 192 +; GCN: enable_dx10_clamp = 0 +; GCN: enable_ieee_mode = 1 +define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -62,7 +72,8 @@ define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* % attributes #0 = { nounwind "target-cpu"="kaveri" } attributes #1 = { nounwind "target-cpu"="fiji" } -attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-denormals" } -attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-denormals" } -attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" } -attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" } +attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" } +attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" } +attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" } +attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } +attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" } diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll index d9662b69b126..b4cdd4030d86 100644 --- a/test/CodeGen/AMDGPU/hsa-func.ll +++ b/test/CodeGen/AMDGPU/hsa-func.ll @@ -26,7 +26,7 @@ ; ELF: Symbol { ; ELF: Name: simple -; ELF: Size: 288 +; ELF: Size: 292 ; ELF: Type: Function (0x2) ; ELF: } diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll index 2820b308edb8..2ec57a40f0a2 100644 --- a/test/CodeGen/AMDGPU/hsa-globals.ll +++ b/test/CodeGen/AMDGPU/hsa-globals.ll @@ -9,7 +9,7 @@ @internal_readonly = internal unnamed_addr addrspace(2) constant i32 0 @external_readonly = unnamed_addr addrspace(2) constant i32 0 -define void @test() { +define amdgpu_kernel void @test() { ret void } diff --git a/test/CodeGen/AMDGPU/hsa-group-segment.ll b/test/CodeGen/AMDGPU/hsa-group-segment.ll index 1999dc38a6b0..600793810e59 100644 --- a/test/CodeGen/AMDGPU/hsa-group-segment.ll +++ b/test/CodeGen/AMDGPU/hsa-group-segment.ll @@ -3,7 +3,7 @@ @internal_group = internal addrspace(3) global i32 undef @external_group = addrspace(3) global i32 undef -define void @test() { +define amdgpu_kernel void @test() { entry: store i32 0, i32 addrspace(3)* @internal_group store i32 0, i32 addrspace(3)* @external_group diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll index a4e599230b74..af63a4f8df76 100644 --- 
a/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -13,6 +13,8 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx804 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI804 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx901 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX901 %s ; HSA: .hsa_code_object_version 2,1 ; HSA-CI700: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" @@ -24,3 +26,5 @@ ; HSA-VI803: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU" ; HSA-VI804: .hsa_code_object_isa 8,0,4,"AMD","AMDGPU" ; HSA-VI810: .hsa_code_object_isa 8,1,0,"AMD","AMDGPU" +; HSA-GFX900: .hsa_code_object_isa 9,0,0,"AMD","AMDGPU" +; HSA-GFX901: .hsa_code_object_isa 9,0,1,"AMD","AMDGPU" diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 12c15441c0f5..972fbd66ef37 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -45,6 +45,8 @@ ; HSA: .amd_kernel_code_t ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: wavefront_size = 6 +; HSA: call_convention = -1 ; HSA: .end_amd_kernel_code_t ; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll index e85db65e7429..f6bf0b09486e 100644 --- a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll +++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll @@ -5,7 +5,7 @@ ; SI-LABEL: {{^}}br_implicit_def: ; SI: BB#0: ; SI-NEXT: s_cbranch_scc1 -define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { +define amdgpu_kernel void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { bb: br i1 undef, label %bb1, label %bb2 diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll index d4912776debd..b160af86a2b6 100644 --- a/test/CodeGen/AMDGPU/i1-copy-phi.ll +++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -10,7 +10,7 @@ ; SI: s_and_saveexec_b64 ; SI: s_xor_b64 ; SI: s_endpgm -define void @br_i1_phi(i32 %arg) { +define amdgpu_kernel void @br_i1_phi(i32 %arg) { bb: %tidig = call i32 @llvm.r600.read.tidig.x() #0 %cmp = trunc i32 %tidig to i1 diff --git a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll index c218e1918bb0..d501be5c8bf0 100644 --- a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll +++ b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll @@ -2,7 +2,7 @@ ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { %1 = load i8, i8 addrspace(1)* %in %2 = uitofp i8 %1 to double %3 = fptrunc double %2 to float diff --git a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll index 60e59a5a5286..12cc440e48d9 100644 --- a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll +++ b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll @@ -6,7 +6,7 @@ ;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;CHECK-NOT: SETNE_INT -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = load i32, i32 addrspace(1)* %in %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/icmp.i16.ll b/test/CodeGen/AMDGPU/icmp.i16.ll index c3dad2d32033..99c2138bbe64 100644 --- a/test/CodeGen/AMDGPU/icmp.i16.ll +++ b/test/CodeGen/AMDGPU/icmp.i16.ll @@ -8,7 +8,7 @@ ; GCN-LABEL: {{^}}i16_eq: ; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -26,7 +26,7 @@ entry: ; GCN-LABEL: {{^}}i16_ne: ; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -44,7 +44,7 @@ entry: ; GCN-LABEL: {{^}}i16_ugt: ; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -62,7 +62,7 @@ entry: ; GCN-LABEL: {{^}}i16_uge: ; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -80,7 +80,7 @@ entry: ; GCN-LABEL: {{^}}i16_ult: ; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -98,7 +98,7 @@ entry: ; GCN-LABEL: {{^}}i16_ule: ; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -117,7 +117,7 @@ entry: ; GCN-LABEL: {{^}}i16_sgt: ; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = 
sext i32 %tid to i64 @@ -135,7 +135,7 @@ entry: ; GCN-LABEL: {{^}}i16_sge: ; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -153,7 +153,7 @@ entry: ; GCN-LABEL: {{^}}i16_slt: ; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -171,7 +171,7 @@ entry: ; GCN-LABEL: {{^}}i16_sle: ; VI: v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -190,7 +190,7 @@ entry: ; GCN-LABEL: {{^}}i16_eq_v_s: ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -206,7 +206,7 @@ entry: ; GCN-LABEL: {{^}}i16_ne_v_s: ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -222,7 +222,7 @@ entry: ; GCN-LABEL: {{^}}i16_ugt_v_s: ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -238,7 +238,7 @@ entry: ; GCN-LABEL: {{^}}i16_uge_v_s: ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -254,7 +254,7 @@ entry: ; GCN-LABEL: {{^}}i16_ult_v_s: ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 
@llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -270,7 +270,7 @@ entry: ; GCN-LABEL: {{^}}i16_ule_v_s: ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -286,7 +286,7 @@ entry: ; GCN-LABEL: {{^}}i16_sgt_v_s: ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -302,7 +302,7 @@ entry: ; GCN-LABEL: {{^}}i16_sge_v_s: ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -318,7 +318,7 @@ entry: ; GCN-LABEL: {{^}}i16_slt_v_s: ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -334,7 +334,7 @@ entry: ; GCN-LABEL: {{^}}i16_sle_v_s: ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/icmp64.ll b/test/CodeGen/AMDGPU/icmp64.ll index 33ad0c9199b9..3af74277df12 100644 --- a/test/CodeGen/AMDGPU/icmp64.ll +++ b/test/CodeGen/AMDGPU/icmp64.ll @@ -3,7 +3,7 @@ ; SI-LABEL: {{^}}test_i64_eq: ; SI: v_cmp_eq_u64 -define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp eq i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -12,7 +12,7 @@ define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_ne: ; SI: v_cmp_ne_u64 -define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ne i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -21,7 +21,7 @@ define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_slt: ; SI: v_cmp_lt_i64 -define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp slt i64 %a, %b %result 
= sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -30,7 +30,7 @@ define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_ult: ; SI: v_cmp_lt_u64 -define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ult i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -39,7 +39,7 @@ define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_sle: ; SI: v_cmp_le_i64 -define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sle i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -48,7 +48,7 @@ define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_ule: ; SI: v_cmp_le_u64 -define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ule i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -57,7 +57,7 @@ define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_sgt: ; SI: v_cmp_gt_i64 -define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sgt i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -66,7 +66,7 @@ define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_ugt: ; SI: v_cmp_gt_u64 -define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ugt i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -75,7 +75,7 @@ define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_sge: ; SI: v_cmp_ge_i64 -define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sge i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -84,7 +84,7 @@ define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_uge: ; SI: v_cmp_ge_u64 -define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp uge i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll new file mode 100644 index 000000000000..6e411ce5e017 --- /dev/null +++ b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll @@ -0,0 +1,45 @@ +; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s + +; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy +; GCN: ; illegal copy v1 
to s9 + +define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_i32() #0 { + %vgpr = call i32 asm sideeffect "; def $0", "=${VGPR1}"() + call void asm sideeffect "; use $0", "${SGPR9}"(i32 %vgpr) + ret void +} + +; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy +; GCN: ; illegal copy v[0:1] to s[10:11] +define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v2i32() #0 { + %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1}"() + call void asm sideeffect "; use $0", "${SGPR10_SGPR11}"(<2 x i32> %vgpr) + ret void +} + +; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v4i32 void (): illegal SGPR to VGPR copy +; GCN: ; illegal copy v[0:3] to s[8:11] +define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v4i32() #0 { + %vgpr = call <4 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3}"() + call void asm sideeffect "; use $0", "${SGPR8_SGPR9_SGPR10_SGPR11}"(<4 x i32> %vgpr) + ret void +} + +; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v8i32 void (): illegal SGPR to VGPR copy +; GCN: ; illegal copy v[0:7] to s[8:15] +define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v8i32() #0 { + %vgpr = call <8 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}"() + call void asm sideeffect "; use $0", "${SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}"(<8 x i32> %vgpr) + ret void +} + +; ERR error: :0:0: in function illegal_vgpr_to_sgpr_copy_v16i32 void (): illegal SGPR to VGPR copy +; GCN: ; illegal copy v[0:15] to s[16:31] +define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { + %vgpr = call <16 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}"() + call void asm sideeffect "; use $0", "${SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23_SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}"(<16 x i32> %vgpr) + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll index 5906b2f15709..53d61e66c6ba 100644 --- a/test/CodeGen/AMDGPU/image-attributes.ll +++ b/test/CodeGen/AMDGPU/image-attributes.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}width_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].Z -define void @width_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @width_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( @@ -20,7 +20,7 @@ entry: ; FUNC-LABEL: {{^}}width_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].Z -define void @width_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @width_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -37,7 +37,7 @@ entry: ; FUNC-LABEL: {{^}}height_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].W -define void @height_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @height_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( @@ -50,7 +50,7 @@ entry: ; FUNC-LABEL: {{^}}height_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].W -define void @height_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @height_3d 
(%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -67,7 +67,7 @@ entry: ; FUNC-LABEL: {{^}}depth_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].X -define void @depth_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @depth_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -84,7 +84,7 @@ entry: ; FUNC-LABEL: {{^}}data_type_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Y -define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( @@ -97,7 +97,7 @@ entry: ; FUNC-LABEL: {{^}}data_type_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Y -define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( @@ -114,7 +114,7 @@ entry: ; FUNC-LABEL: {{^}}channel_order_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Z -define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( @@ -127,7 +127,7 @@ entry: ; FUNC-LABEL: {{^}}channel_order_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Z -define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( @@ -146,7 +146,7 @@ entry: ; FUNC-LABEL: {{^}}image_arg_2nd: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[4].Z -define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, +define amdgpu_kernel void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, i32 %x, %opencl.image2d_t addrspace(1)* %in2, i32 addrspace(1)* %out) { diff --git a/test/CodeGen/AMDGPU/image-resource-id.ll b/test/CodeGen/AMDGPU/image-resource-id.ll index d4cf34944240..dac7c7ddaeac 100644 --- a/test/CodeGen/AMDGPU/image-resource-id.ll +++ b/test/CodeGen/AMDGPU/image-resource-id.ll @@ -7,7 +7,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only +define amdgpu_kernel void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( @@ -21,7 +21,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only +define amdgpu_kernel void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( @@ -37,7 +37,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only +define amdgpu_kernel void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only i32 addrspace(1)* %out) { entry: %0 
= call i32 @llvm.OpenCL.image.get.resource.id.2d( @@ -51,7 +51,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only +define amdgpu_kernel void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( @@ -67,7 +67,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -82,7 +82,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -97,7 +97,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -112,7 +112,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -129,7 +129,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -144,7 +144,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -159,7 +159,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -174,7 +174,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -191,7 +191,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -208,7 +208,7 @@ entry: ; EG: MOV 
[[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -226,7 +226,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only %opencl.image2d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -243,7 +243,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only %opencl.image3d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -261,7 +261,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -277,7 +277,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -293,7 +293,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -309,7 +309,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll index ef6008aa5fde..c2668a077b09 100644 --- a/test/CodeGen/AMDGPU/imm.ll +++ b/test/CodeGen/AMDGPU/imm.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}i64_imm_inline_lo: ; GCN: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: -define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { +define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) { entry: store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 ret void @@ -15,7 +15,7 @@ entry: ; GCN-LABEL: {{^}}i64_imm_inline_hi: ; GCN: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] -define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { +define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) { entry: store i64 21780256376, i64 addrspace(1) 
*%out ; 0x0000000512345678 ret void @@ -25,7 +25,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { +define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { store i64 -9223372036854775808, i64 addrspace(1) *%out ret void } @@ -33,7 +33,7 @@ define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { store i32 -2147483648, i32 addrspace(1)* %out ret void } @@ -41,7 +41,7 @@ define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_0.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { store float 0.0, float addrspace(1)* %out ret void } @@ -49,7 +49,7 @@ define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_imm_neg_0.0_f32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { store float -0.0, float addrspace(1)* %out ret void } @@ -57,7 +57,7 @@ define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_0.5_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { store float 0.5, float addrspace(1)* %out ret void } @@ -65,7 +65,7 @@ define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { store float -0.5, float addrspace(1)* %out ret void } @@ -73,7 +73,7 @@ define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_1.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { store float 1.0, float addrspace(1)* %out ret void } @@ -81,7 +81,7 @@ define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { store float -1.0, float addrspace(1)* %out ret void } @@ -89,7 +89,7 @@ define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_2.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void 
@store_inline_imm_2.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { store float 2.0, float addrspace(1)* %out ret void } @@ -97,7 +97,7 @@ define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { store float -2.0, float addrspace(1)* %out ret void } @@ -105,7 +105,7 @@ define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_4.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { store float 4.0, float addrspace(1)* %out ret void } @@ -113,7 +113,7 @@ define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { store float -4.0, float addrspace(1)* %out ret void } @@ -123,7 +123,7 @@ define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e22f983{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0.15915494{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) { store float 0x3FC45F3060000000, float addrspace(1)* %out ret void } @@ -131,7 +131,7 @@ define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f32: ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbe22f983{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) { store float 0xBFC45F3060000000, float addrspace(1)* %out ret void } @@ -139,7 +139,7 @@ define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_literal_imm_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 ; GCN: buffer_store_dword [[REG]] -define void @store_literal_imm_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) { store float 4096.0, float addrspace(1)* %out ret void } @@ -148,7 +148,7 @@ define void @store_literal_imm_f32(float addrspace(1)* %out) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0.0 store float %y, float addrspace(1)* %out ret void @@ -158,7 +158,7 @@ define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void 
@add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0.5 store float %y, float addrspace(1)* %out ret void @@ -168,7 +168,7 @@ define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -0.5 store float %y, float addrspace(1)* %out ret void @@ -178,7 +178,7 @@ define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 1.0 store float %y, float addrspace(1)* %out ret void @@ -188,7 +188,7 @@ define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -1.0 store float %y, float addrspace(1)* %out ret void @@ -198,7 +198,7 @@ define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 2.0 store float %y, float addrspace(1)* %out ret void @@ -208,7 +208,7 @@ define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -2.0 store float %y, float addrspace(1)* %out ret void @@ -218,7 +218,7 @@ define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 4.0 store float %y, float addrspace(1)* %out ret void @@ -228,7 +228,7 @@ define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -4.0 store float %y, float addrspace(1)* %out ret void @@ -238,7 +238,7 @@ define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_add_f32_e32 
[[REG:v[0-9]+]], 0.5, [[VAL]] ; GCN: buffer_store_dword [[REG]] -define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %x = load float, float addrspace(1)* %in %y = fadd float %x, 0.5 store float %y, float addrspace(1)* %out @@ -249,7 +249,7 @@ define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addr ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] ; GCN: buffer_store_dword [[REG]] -define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %x = load float, float addrspace(1)* %in %y = fadd float %x, 1024.0 store float %y, float addrspace(1)* %out @@ -260,7 +260,7 @@ define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1 ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36a0000000000000 store float %y, float addrspace(1)* %out ret void @@ -270,7 +270,7 @@ define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36b0000000000000 store float %y, float addrspace(1)* %out ret void @@ -280,7 +280,7 @@ define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 16 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36e0000000000000 store float %y, float addrspace(1)* %out ret void @@ -290,7 +290,7 @@ define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0xffffffffe0000000 store float %y, float addrspace(1)* %out ret void @@ -300,7 +300,7 @@ define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0xffffffffc0000000 store float %y, float addrspace(1)* %out ret void @@ -310,7 +310,7 @@ define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -16 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_16_f32(float addrspace(1)* 
%out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0xfffffffe00000000 store float %y, float addrspace(1)* %out ret void @@ -320,7 +320,7 @@ define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 63 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36ff800000000000 store float %y, float addrspace(1)* %out ret void @@ -330,7 +330,7 @@ define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 64 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x3700000000000000 store float %y, float addrspace(1)* %out ret void @@ -342,7 +342,7 @@ define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0.0 store double %y, double addrspace(1)* %out ret void @@ -353,7 +353,7 @@ define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0.5 store double %y, double addrspace(1)* %out ret void @@ -364,7 +364,7 @@ define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -0.5 store double %y, double addrspace(1)* %out ret void @@ -375,7 +375,7 @@ define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 1.0 store double %y, double addrspace(1)* %out ret void @@ -386,7 +386,7 @@ define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { +define 
amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -1.0 store double %y, double addrspace(1)* %out ret void @@ -397,7 +397,7 @@ define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 2.0 store double %y, double addrspace(1)* %out ret void @@ -408,7 +408,7 @@ define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -2.0 store double %y, double addrspace(1)* %out ret void @@ -419,7 +419,7 @@ define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 4.0 store double %y, double addrspace(1)* %out ret void @@ -430,7 +430,7 @@ define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -4.0 store double %y, double addrspace(1)* %out ret void @@ -445,7 +445,7 @@ define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}} ; VI: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x3fc45f306dc9c882 store double %y, double addrspace(1)* %out ret void @@ -455,7 +455,7 @@ define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xbfc45f306dc9c882 store double %y, double addrspace(1)* %out ret void @@ -466,7 +466,7 @@ define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 
[[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000001 store double %y, double addrspace(1)* %out ret void @@ -477,7 +477,7 @@ define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000002 store double %y, double addrspace(1)* %out ret void @@ -488,7 +488,7 @@ define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000010 store double %y, double addrspace(1)* %out ret void @@ -499,7 +499,7 @@ define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xffffffffffffffff store double %y, double addrspace(1)* %out ret void @@ -510,7 +510,7 @@ define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xfffffffffffffffe store double %y, double addrspace(1)* %out ret void @@ -521,7 +521,7 @@ define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xfffffffffffffff0 store double %y, double addrspace(1)* %out ret void @@ -532,7 +532,7 @@ define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x000000000000003F store double %y, double addrspace(1)* %out ret void @@ -543,7 +543,7 @@ define void 
@add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000040 store double %y, double addrspace(1)* %out ret void @@ -554,7 +554,7 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 ; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], v[[LO_VREG]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { store double 0.0, double addrspace(1)* %out ret void } @@ -564,7 +564,7 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { store double -0.0, double addrspace(1)* %out ret void } @@ -573,7 +573,7 @@ define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { store double 0.5, double addrspace(1)* %out ret void } @@ -582,7 +582,7 @@ define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { store double -0.5, double addrspace(1)* %out ret void } @@ -591,7 +591,7 @@ define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { store double 1.0, double addrspace(1)* %out ret void } @@ -600,7 +600,7 @@ define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { store double -1.0, double addrspace(1)* %out ret void } @@ -609,7 +609,7 @@ define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 ; GCN: buffer_store_dwordx2 
v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { store double 2.0, double addrspace(1)* %out ret void } @@ -618,7 +618,7 @@ define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { store double -2.0, double addrspace(1)* %out ret void } @@ -627,7 +627,7 @@ define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { store double 4.0, double addrspace(1)* %out ret void } @@ -636,7 +636,7 @@ define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { store double -4.0, double addrspace(1)* %out ret void } @@ -645,7 +645,7 @@ define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inv_2pi_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) { store double 0x3fc45f306dc9c882, double addrspace(1)* %out ret void } @@ -654,7 +654,7 @@ define void @store_inv_2pi_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) { store double 0xbfc45f306dc9c882, double addrspace(1)* %out ret void } @@ -663,7 +663,22 @@ define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) { store double 4096.0, double addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}literal_folding: +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}} +define amdgpu_vs void @literal_folding(float %arg) { +main_body: + %tmp = fmul float %arg, 0x3FE86A7F00000000 + %tmp1 = fmul float %arg, 0xBFE86A7F00000000 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp, float %tmp, float %tmp1, float %tmp1, i1 true, i1 false) #0 + ret void +} + +declare void 
@llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/imm16.ll b/test/CodeGen/AMDGPU/imm16.ll index 2e73eb06502f..e42d58791890 100644 --- a/test/CodeGen/AMDGPU/imm16.ll +++ b/test/CodeGen/AMDGPU/imm16.ll @@ -7,7 +7,7 @@ ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) { store volatile i16 -32768, i16 addrspace(1)* %out ret void } @@ -15,7 +15,7 @@ define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_0.0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) { store half 0.0, half addrspace(1)* %out ret void } @@ -24,7 +24,7 @@ define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) { store half -0.0, half addrspace(1)* %out ret void } @@ -32,7 +32,7 @@ define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_0.5_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) { store half 0.5, half addrspace(1)* %out ret void } @@ -41,7 +41,7 @@ define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) { store half -0.5, half addrspace(1)* %out ret void } @@ -49,7 +49,7 @@ define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_1.0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) { store half 1.0, half addrspace(1)* %out ret void } @@ -58,7 +58,7 @@ define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) { store half -1.0, half addrspace(1)* %out ret void } @@ -66,7 +66,7 @@ define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_2.0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) { store half 2.0, half addrspace(1)* %out ret void } @@ -75,7 +75,7 @@ 
define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) { store half -2.0, half addrspace(1)* %out ret void } @@ -83,7 +83,7 @@ define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_4.0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) { store half 4.0, half addrspace(1)* %out ret void } @@ -92,7 +92,7 @@ define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) { store half -4.0, half addrspace(1)* %out ret void } @@ -101,7 +101,7 @@ define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) { store half 0xH3118, half addrspace(1)* %out ret void } @@ -110,7 +110,7 @@ define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}} ; GCN: buffer_store_short [[REG]] -define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) { store half 0xHB118, half addrspace(1)* %out ret void } @@ -118,7 +118,7 @@ define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_literal_imm_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00 ; GCN: buffer_store_short [[REG]] -define void @store_literal_imm_f16(half addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) { store half 4096.0, half addrspace(1)* %out ret void } @@ -127,7 +127,7 @@ define void @store_literal_imm_f16(half addrspace(1)* %out) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.0 store half %y, half addrspace(1)* %out ret void @@ -137,7 +137,7 @@ define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.5 store half %y, half addrspace(1)* %out ret void @@ -147,7 +147,7 @@ define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 
[[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -0.5 store half %y, half addrspace(1)* %out ret void @@ -157,7 +157,7 @@ define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 1.0 store half %y, half addrspace(1)* %out ret void @@ -167,7 +167,7 @@ define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -1.0 store half %y, half addrspace(1)* %out ret void @@ -177,7 +177,7 @@ define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 2.0 store half %y, half addrspace(1)* %out ret void @@ -187,7 +187,7 @@ define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -2.0 store half %y, half addrspace(1)* %out ret void @@ -197,7 +197,7 @@ define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 4.0 store half %y, half addrspace(1)* %out ret void @@ -207,7 +207,7 @@ define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -4.0 store half %y, half addrspace(1)* %out ret void @@ -217,7 +217,7 @@ define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %x = load half, half addrspace(1)* %in %y = fadd half %x, 0.5 
store half %y, half addrspace(1)* %out @@ -228,7 +228,7 @@ define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrsp ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %x = load half, half addrspace(1)* %in %y = fadd half %x, 1024.0 store half %y, half addrspace(1)* %out @@ -239,7 +239,7 @@ define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0001 store half %y, half addrspace(1)* %out ret void @@ -249,7 +249,7 @@ define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0002 store half %y, half addrspace(1)* %out ret void @@ -259,7 +259,7 @@ define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0010 store half %y, half addrspace(1)* %out ret void @@ -269,7 +269,7 @@ define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFFF store half %y, half addrspace(1)* %out ret void @@ -279,7 +279,7 @@ define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFFE store half %y, half addrspace(1)* %out ret void @@ -289,7 +289,7 @@ define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFF0 store half %y, half addrspace(1)* %out ret void @@ -299,7 +299,7 @@ define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]] ; VI: buffer_store_short [[REG]] -define void 
@add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH003F store half %y, half addrspace(1)* %out ret void @@ -309,7 +309,7 @@ define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0040 store half %y, half addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll new file mode 100644 index 000000000000..85ad365d02a8 --- /dev/null +++ b/test/CodeGen/AMDGPU/immv216.ll @@ -0,0 +1,446 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; FIXME: Merge into imm.ll + +; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 { + store <2 x i16> , <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}} +; GCN: buffer_store_dword [[REG]] +define 
amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_literal_imm_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> , <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]] +; VI: v_or_b32 +; VI: 
buffer_store_dword +define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort 
[[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16: +; GFX9: buffer_load_dword [[VAL:v[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_dword +; VI-NOT: and +; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %x = load <2 x half>, <2 x half> addrspace(1)* %in + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}commute_add_literal_v2f16: +; GFX9: buffer_load_dword [[VAL:v[0-9]+]] +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400 +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] +; GFX9: buffer_store_dword [[REG]] + +; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}} +; VI-DAG: buffer_load_dword +; VI-NOT: and +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: buffer_store_dword +define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %x = load <2 x half>, <2 x half> addrspace(1)* %in + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_1_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_2_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_16_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel 
void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -16{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_63_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63 +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_64_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64 +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll index 877956be3088..8e207a38c847 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll @@ -10,7 +10,7 @@ ; CHECK: 
s_mov_b32 m0, [[IN]] ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]] ; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %ins = insertelement <4 x float> , float 5.0, i32 %in store <4 x float> %ins, <4 x float> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 208e55c143ac..b18ae353ca4c 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. @@ -18,7 +19,7 @@ ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) { entry: %idx = add i32 %in, 1 %elt = extractelement <4 x float> , i32 %idx @@ -43,7 +44,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { +define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { entry: %idx = add i32 %in, 1 %vec = or <4 x i32> %or.val, @@ -65,7 +66,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { entry: %elt = extractelement <4 x float> , i32 %in store float %elt, float addrspace(1)* %out @@ -83,7 +84,7 @@ entry: ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 %value = extractelement <4 x i32> , i32 %index @@ -104,7 +105,7 @@ entry: ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { +define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { entry: %index = add i32 %offset, -512 %or = or <4 x i32> %vec0, %vec1 @@ -136,7 +137,7 @@ entry: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[RESULT]] 
-define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { +define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 @@ -146,7 +147,7 @@ entry: } ; GCN-LABEL: {{^}}extract_undef_offset_sgpr: -define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %value = extractelement <4 x i32> %ld, i32 undef @@ -158,7 +159,7 @@ entry: ; GCN-DAG: buffer_load_dwordx4 ; MOVREL-DAG: s_mov_b32 m0, ; MOVREL: v_movreld_b32 -define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %value = insertelement <4 x i32> %ld, i32 5, i32 undef @@ -177,7 +178,7 @@ entry: ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]] ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}} -define void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %0 = add i32 %in, 1 %1 = insertelement <4 x float> , float 5.0, i32 %0 @@ -196,7 +197,7 @@ entry: ; IDXMODE-NEXT: s_set_gpr_idx_off ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %0 = insertelement <4 x float> , float 5.0, i32 %in store <4 x float> %0, <4 x float> addrspace(1)* %out @@ -212,7 +213,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 %value = insertelement <4 x i32> , i32 5, i32 %index @@ -232,7 +233,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { +define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { entry: %index = add i32 %offset, -512 %value = insertelement <4 x i32> %vec, i32 5, i32 %index @@ -269,7 +270,7 @@ entry: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword -define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 @@ -304,7 +305,7 @@ entry: ; GCN: s_cbranch_execnz ; IDXMODE: s_set_gpr_idx_off -define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -16 @@ -374,7 +375,7 @@ entry: 
; GCN: buffer_store_dword [[MOVREL0]] ; GCN: buffer_store_dword [[MOVREL1]] -define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %id.ext = zext i32 %id to i64 @@ -449,7 +450,7 @@ bb2: ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: ; GCN: buffer_store_dword [[INS0]] -define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %id.ext = zext i32 %id to i64 @@ -498,7 +499,7 @@ bb2: ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @extract_adjacent_blocks(i32 %arg) #0 { +define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 { bb: %tmp = icmp eq i32 %arg, 0 br i1 %tmp, label %bb1, label %bb4 @@ -548,7 +549,7 @@ bb7: ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { +define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { bb: %tmp = icmp eq i32 %arg, 0 br i1 %tmp, label %bb1, label %bb4 @@ -609,7 +610,7 @@ bb7: ; preds = %bb4, %bb1 ; GCN: ds_write_b32 ; GCN: ds_write_b32 ; GCN: s_endpgm -define void @multi_same_block(i32 %arg) #0 { +define amdgpu_kernel void @multi_same_block(i32 %arg) #0 { bb: %tmp1 = add i32 %arg, -16 %tmp2 = insertelement <6 x float> , float 4.000000e+00, i32 %tmp1 @@ -636,7 +637,7 @@ bb: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] -define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %offset = add i32 %idx, 3 @@ -657,7 +658,7 @@ entry: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] -define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %offset = add i32 %idx, 4 @@ -680,7 +681,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off -define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) { +define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %idx.shl = shl i32 %idx.in, 2 @@ -701,7 +702,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off -define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind { +define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind { %idx.shl = 
shl i32 %idx.in, 2 %idx = or i32 %idx.shl, 1 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx @@ -728,7 +729,7 @@ define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x f ; IDXMODE: s_set_gpr_idx_idx ; IDXMODE: v_mov_b32_e32 ; GCN: s_cbranch_execnz [[REGLOOP]] -define void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { +define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { bb: br label %bb2 diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll index 4db87c3c1b64..7f08a89d149e 100644 --- a/test/CodeGen/AMDGPU/indirect-private-64.ll +++ b/test/CodeGen/AMDGPU/indirect-private-64.ll @@ -20,10 +20,10 @@ declare void @llvm.amdgcn.s.barrier() #0 ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write_b64 ; CI-PROMOTE: ds_read_b64 -define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { %val = load double, double addrspace(1)* %in, align 8 - %array = alloca [16 x double], align 8 - %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b + %array = alloca [8 x double], align 8 + %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b store double %val, double* %ptr, align 8 call void @llvm.amdgcn.s.barrier() %result = load double, double* %ptr, align 8 @@ -51,10 +51,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write2_b64 ; CI-PROMOTE: ds_read2_b64 -define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 - %array = alloca [8 x <2 x double>], align 16 - %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b + %array = alloca [4 x <2 x double>], align 16 + %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b store <2 x double> %val, <2 x double>* %ptr, align 16 call void @llvm.amdgcn.s.barrier() %result = load <2 x double>, <2 x double>* %ptr, align 16 @@ -77,7 +77,7 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write_b64 ; CI-PROMOTE: ds_read_b64 -define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { %val = load i64, i64 addrspace(1)* %in, align 8 %array = alloca [8 x i64], align 8 %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b @@ -109,10 +109,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write2_b64 ; CI-PROMOTE: ds_read2_b64 -define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x i64>, <2 x i64> addrspace(1)* 
%in, align 16 - %array = alloca [8 x <2 x i64>], align 16 - %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b + %array = alloca [4 x <2 x i64>], align 16 + %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b store <2 x i64> %val, <2 x i64>* %ptr, align 16 call void @llvm.amdgcn.s.barrier() %result = load <2 x i64>, <2 x i64>* %ptr, align 16 @@ -121,4 +121,4 @@ define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* %out, < } attributes #0 = { convergent nounwind } -attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,64" } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,128" } diff --git a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll index 990f33518ab9..7cee8a41c120 100644 --- a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll +++ b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll @@ -2,7 +2,7 @@ ; REQUIRES: asserts ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -define void @inf_loop_irreducible_cfg() nounwind { +define amdgpu_kernel void @inf_loop_irreducible_cfg() nounwind { entry: br label %block diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll index 3e0b695934c7..73482756b8c8 100644 --- a/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/test/CodeGen/AMDGPU/infinite-loop.ll @@ -7,7 +7,7 @@ ; SI: buffer_store_dword [[REG]] ; SI: s_waitcnt vmcnt(0) expcnt(0) ; SI: s_branch BB0_1 -define void @infinite_loop(i32 addrspace(1)* %out) { +define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) { entry: br label %for.body diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll index db1a0c67436d..0d7e07b9a624 100644 --- a/test/CodeGen/AMDGPU/inline-asm.ll +++ b/test/CodeGen/AMDGPU/inline-asm.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}inline_asm: ; CHECK: s_endpgm ; CHECK: s_endpgm -define void @inline_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @inline_asm(i32 addrspace(1)* %out) { entry: store i32 5, i32 addrspace(1)* %out call void asm sideeffect "s_endpgm", ""() @@ -25,7 +25,7 @@ entry: ; Make sure inline assembly is treated as divergent.
; CHECK: s_mov_b32 s{{[0-9]+}}, 0 ; CHECK: s_and_saveexec_b64 -define void @branch_on_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) { %zero = call i32 asm "s_mov_b32 $0, 0", "=s"() %cmp = icmp eq i32 %zero, 0 br i1 %cmp, label %if, label %endif @@ -44,7 +44,7 @@ endif: ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]] ; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in) store i64 %sgpr, i64 addrspace(1)* %out ret void @@ -52,7 +52,7 @@ define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { ; CHECK-LABEL: {{^}}code_size_inline_asm: ; CHECK: codeLenInByte = 12 -define void @code_size_inline_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e64", ""() ret void @@ -61,7 +61,7 @@ entry: ; All inlineasm instructions are assumed to be the maximum size ; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst: ; CHECK: codeLenInByte = 12 -define void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e32", ""() ret void @@ -69,7 +69,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect " v_nop_e64 @@ -80,7 +80,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) { entry: call void asm sideeffect " v_nop_e64 @@ -92,7 +92,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect "", ""() ret void @@ -100,7 +100,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment", ""() ret void @@ -108,7 +108,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) { entry: call void asm sideeffect " ; comment", ""() @@ -117,7 +117,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment ", ""() @@ -126,7 +126,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line: ; CHECK: codeLenInByte = 4 -define void 
@code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; first comment ; second comment", ""() ret void @@ -134,7 +134,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; first comment;second comment", ""() ret void @@ -142,7 +142,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment v_nop_e64 ; inline comment @@ -157,7 +157,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e64 ; inline comment ; separate comment @@ -171,7 +171,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment v_add_i32_e32 v0, vcc, v1, v2 ; inline comment @@ -183,3 +183,52 @@ entry: ", ""() ret void } + +; FIXME: Should not have intermediate sgprs +; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr: +; CHECK: s_mov_b32 s1, 0 +; CHECK: s_mov_b32 s0, 0x1e240 +; CHECK: v_mov_b32_e32 v0, s0 +; CHECK: v_mov_b32_e32 v1, s1 +; CHECK: use v[0:1] +define void @i64_imm_input_phys_vgpr() { +entry: + call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456) + ret void +} + +; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr: +; CHECK: v_mov_b32_e32 v0, -1{{$}} +; CHECK: ; use v0 +define amdgpu_kernel void @i1_imm_input_phys_vgpr() { +entry: + call void asm sideeffect "; use $0 ", "{VGPR0}"(i1 true) + ret void +} + +; CHECK-LABEL: {{^}}i1_input_phys_vgpr: +; CHECK: {{buffer|flat}}_load_ubyte [[LOAD:v[0-9]+]] +; CHECK: v_and_b32_e32 [[LOAD]], 1, [[LOAD]] +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]] +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; CHECK: ; use v0 +define amdgpu_kernel void @i1_input_phys_vgpr() { +entry: + %val = load i1, i1 addrspace(1)* undef + call void asm sideeffect "; use $0 ", "{VGPR0}"(i1 %val) + ret void +} + +; FIXME: Should be scheduled to shrink vcc +; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2: +; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc +; CHECK: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +define amdgpu_kernel void @i1_input_phys_vgpr_x2() { +entry: + %val0 = load volatile i1, i1 addrspace(1)* undef + %val1 = load volatile i1, i1 addrspace(1)* undef + call void asm sideeffect "; use $0 $1 ", "{VGPR0}, {VGPR1}"(i1 %val0, i1 %val1) + ret void +} diff --git a/test/CodeGen/AMDGPU/inline-calls.ll b/test/CodeGen/AMDGPU/inline-calls.ll index 4541a902c1b8..f8821f319893 100644 --- a/test/CodeGen/AMDGPU/inline-calls.ll +++ b/test/CodeGen/AMDGPU/inline-calls.ll @@ -11,7 +11,7 
@@ entry: ; CHECK: {{^}}kernel: ; CHECK-NOT: call -define void @kernel(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @func(i32 1) store i32 %tmp0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ entry: ; CHECK: {{^}}kernel2: ; CHECK-NOT: call -define void @kernel2(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel2(i32 addrspace(1)* %out) { entry: call void @kernel(i32 addrspace(1)* %out) ret void @@ -31,7 +31,7 @@ entry: ; CHECK: {{^}}kernel3: ; CHECK-NOT: call -define void @kernel3(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel3(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @func_alias(i32 1) store i32 %tmp0, i32 addrspace(1)* %out @@ -43,7 +43,7 @@ entry: ; CHECK: {{^}}kernel4: ; CHECK-NOT: call -define void @kernel4(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel4(i32 addrspace(1)* %out) { entry: call void @kernel_alias(i32 addrspace(1)* %out) ret void diff --git a/test/CodeGen/AMDGPU/inline-constraints.ll b/test/CodeGen/AMDGPU/inline-constraints.ll index 1bcbd14009ce..941a1b90dcc1 100644 --- a/test/CodeGen/AMDGPU/inline-constraints.ll +++ b/test/CodeGen/AMDGPU/inline-constraints.ll @@ -10,7 +10,7 @@ ; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] ; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] -define void @inline_reg_constraints(i32 addrspace(1)* %ptr) { +define amdgpu_kernel void @inline_reg_constraints(i32 addrspace(1)* %ptr) { entry: %v32 = tail call i32 asm sideeffect "flat_load_dword $0, $1", "=v,v"(i32 addrspace(1)* %ptr) %v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) @@ -27,7 +27,7 @@ entry: ; GCN: s_mov_b32 m0, -1 ; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; GCN: ; use [[COPY_M0]] -define void @inline_sreg_constraint_m0() { +define amdgpu_kernel void @inline_sreg_constraint_m0() { %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"() tail call void asm sideeffect "; use $0", "s"(i32 %m0) ret void @@ -36,7 +36,7 @@ define void @inline_sreg_constraint_m0() { ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32: ; GCN: s_mov_b32 [[REG:s[0-9]+]], 32 ; GCN: ; use [[REG]] -define void @inline_sreg_constraint_imm_i32() { +define amdgpu_kernel void @inline_sreg_constraint_imm_i32() { tail call void asm sideeffect "; use $0", "s"(i32 32) ret void } @@ -44,7 +44,7 @@ define void @inline_sreg_constraint_imm_i32() { ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32: ; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0 ; GCN: ; use [[REG]] -define void @inline_sreg_constraint_imm_f32() { +define amdgpu_kernel void @inline_sreg_constraint_imm_f32() { tail call void asm sideeffect "; use $0", "s"(float 1.0) ret void } @@ -54,7 +54,7 @@ define void @inline_sreg_constraint_imm_f32() { ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}} ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}} ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}} -define void @inline_sreg_constraint_imm_i64() { +define amdgpu_kernel void @inline_sreg_constraint_imm_i64() { tail call void asm sideeffect "; use $0", "s"(i64 -4) ret void } @@ -63,7 +63,7 @@ define void @inline_sreg_constraint_imm_i64() { ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}} ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}} ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}} -define void @inline_sreg_constraint_imm_f64() { +define amdgpu_kernel void @inline_sreg_constraint_imm_f64() { tail call void asm sideeffect "; use $0", "s"(double 1.0) ret void } diff --git 
a/test/CodeGen/AMDGPU/inlineasm-16.ll b/test/CodeGen/AMDGPU/inlineasm-16.ll index 75f3158937dc..15e57fe6bffb 100644 --- a/test/CodeGen/AMDGPU/inlineasm-16.ll +++ b/test/CodeGen/AMDGPU/inlineasm-16.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}s_input_output_i16: ; SICI: error: couldn't allocate output register for constraint 's' ; SICI: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_i16() #0 { +define amdgpu_kernel void @s_input_output_i16() #0 { %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i16 %v) #0 ret void @@ -14,7 +14,7 @@ define void @s_input_output_i16() #0 { ; GCN-LABEL: {{^}}v_input_output_i16: ; SICI: error: couldn't allocate output register for constraint 'v' ; SICI: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_i16() #0 { +define amdgpu_kernel void @v_input_output_i16() #0 { %v = tail call i16 asm sideeffect "v_mov_b32 $0, -1", "=v"() #0 tail call void asm sideeffect "; use $0", "v"(i16 %v) ret void @@ -23,7 +23,7 @@ define void @v_input_output_i16() #0 { ; GCN-LABEL: {{^}}s_input_output_f16: ; SICI: error: couldn't allocate output register for constraint 's' ; SICI: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_f16() #0 { +define amdgpu_kernel void @s_input_output_f16() #0 { %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() #0 tail call void asm sideeffect "; use $0", "s"(half %v) ret void @@ -32,7 +32,7 @@ define void @s_input_output_f16() #0 { ; GCN-LABEL: {{^}}v_input_output_f16: ; SICI: error: couldn't allocate output register for constraint 'v' ; SICI: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_f16() #0 { +define amdgpu_kernel void @v_input_output_f16() #0 { %v = tail call half asm sideeffect "v_mov_b32 $0, -1", "=v"() #0 tail call void asm sideeffect "; use $0", "v"(half %v) ret void diff --git a/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll index 2eb21f07e0ec..c1d67ba614c6 100644 --- a/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ b/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -3,7 +3,7 @@ ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_i8() { +define amdgpu_kernel void @s_input_output_i8() { %v = tail call i8 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i8 %v) ret void @@ -11,7 +11,7 @@ define void @s_input_output_i8() { ; GCN: error: couldn't allocate output register for constraint 'v' ; GCN: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_i8() { +define amdgpu_kernel void @v_input_output_i8() { %v = tail call i8 asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(i8 %v) ret void @@ -19,7 +19,7 @@ define void @v_input_output_i8() { ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_i128() { +define amdgpu_kernel void @s_input_output_i128() { %v = tail call i128 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i128 %v) ret void @@ -27,7 +27,7 @@ define void @s_input_output_i128() { ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void 
@s_input_output_v8f16() { +define amdgpu_kernel void @s_input_output_v8f16() { %v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<8 x half> %v) ret void @@ -36,7 +36,7 @@ define void @s_input_output_v8f16() { ; CI: error: couldn't allocate output register for constraint 's' ; CI: error: couldn't allocate input reg for constraint 's' ; VI-NOT: error -define void @s_input_output_f16() { +define amdgpu_kernel void @s_input_output_f16() { %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(half %v) ret void @@ -44,7 +44,7 @@ define void @s_input_output_f16() { ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_v2f16() { +define amdgpu_kernel void @s_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x half> %v) ret void @@ -52,7 +52,7 @@ define void @s_input_output_v2f16() { ; GCN: error: couldn't allocate output register for constraint 'v' ; GCN: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_v2f16() { +define amdgpu_kernel void @v_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(<2 x half> %v) ret void @@ -61,7 +61,7 @@ define void @v_input_output_v2f16() { ; CI: error: couldn't allocate output register for constraint 's' ; CI: error: couldn't allocate input reg for constraint 's' ; VI-NOT: error -define void @s_input_output_i16() { +define amdgpu_kernel void @s_input_output_i16() { %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i16 %v) ret void @@ -69,14 +69,14 @@ define void @s_input_output_i16() { ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_v2i16() { +define amdgpu_kernel void @s_input_output_v2i16() { %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v) ret void } ; FIXME: Crash in codegen prepare -; define void @s_input_output_i3() { +; define amdgpu_kernel void @s_input_output_i3() { ; %v = tail call i3 asm sideeffect "s_mov_b32 $0, -1", "=s"() ; tail call void asm sideeffect "; use $0", "s"(i3 %v) ; ret void diff --git a/test/CodeGen/AMDGPU/inlineasm-packed.ll b/test/CodeGen/AMDGPU/inlineasm-packed.ll new file mode 100644 index 000000000000..3c6c7e1d1b42 --- /dev/null +++ b/test/CodeGen/AMDGPU/inlineasm-packed.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s + +; GCN-LABEL: {{^}}inline_asm_input_v2i16: +; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +define amdgpu_kernel void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 { +entry: + %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x i16> %in) #0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}inline_asm_input_v2f16: +; GCN: s_mov_b32 s0, s{{[0-9]+}} +define amdgpu_kernel void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 { +entry: + %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}inline_asm_output_v2i16: +; GCN: 
s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +define amdgpu_kernel void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 { +entry: + %val = call <2 x i16> asm "s_mov_b32 $0, $1", "=r,r"(i32 %in) #0 + store <2 x i16> %val, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}inline_asm_output_v2f16: +; GCN: v_mov_b32 v{{[0-9]+}}, s{{[0-9]+}} +define amdgpu_kernel void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { +entry: + %val = call <2 x half> asm "v_mov_b32 $0, $1", "=v,r"(i32 %in) #0 + store <2 x half> %val, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}inline_asm_packed_v2i16: +; GCN: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 { +entry: + %val = call <2 x i16> asm "v_pk_add_u16 $0, $1, $2", "=v,r,v"(<2 x i16> %in0, <2 x i16> %in1) #0 + store <2 x i16> %val, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}inline_asm_packed_v2f16: +; GCN: v_pk_add_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 { +entry: + %val = call <2 x half> asm "v_pk_add_f16 $0, $1, $2", "=v,r,v"(<2 x half> %in0, <2 x half> %in1) #0 + store <2 x half> %val, <2 x half> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir b/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir new file mode 100644 index 000000000000..bd5f296affb5 --- /dev/null +++ b/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir @@ -0,0 +1,40 @@ +# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold=1 %s -o - | FileCheck %s +# https://bugs.freedesktop.org/show_bug.cgi?id=99019 +--- | + define amdgpu_ps void @kill_uncond_branch() { + ret void + } +... +--- + +# CHECK-LABEL: name: kill_uncond_branch + +# CHECK: bb.0: +# CHECK: S_CBRANCH_VCCNZ %bb.1, implicit %vcc + +# CHECK: bb.1: +# CHECK: V_CMPX_LE_F32_e32 +# CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit %exec + +# CHECK: bb.3: +# CHECK-NEXT: EXP_DONE +# CHECK: S_ENDPGM + +# CHECK: bb.2: +# CHECK: S_ENDPGM + +name: kill_uncond_branch + +body: | + bb.0: + successors: %bb.1 + S_CBRANCH_VCCNZ %bb.1, implicit %vcc + + bb.1: + successors: %bb.2 + %vgpr0 = V_MOV_B32_e32 0, implicit %exec + SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM diff --git a/test/CodeGen/AMDGPU/insert-waits-callee.mir b/test/CodeGen/AMDGPU/insert-waits-callee.mir new file mode 100644 index 000000000000..ad7cd0cc8abf --- /dev/null +++ b/test/CodeGen/AMDGPU/insert-waits-callee.mir @@ -0,0 +1,25 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s +--- | + define float @entry_callee_wait(float %arg) #0 { + ret float %arg + } + + attributes #0 = { nounwind } +... +--- +# CHECK-LABEL: name: entry_callee_wait{{$}} +# CHECK: bb.0: +# CHECK-NEXT: S_WAITCNT 0{{$}} +# CHECK-NEXT: V_ADD_F32 +# CHECK-NEXT: S_SETPC_B64 +liveins: + - { reg: '%sgpr0_sgpr1' } + - { reg: '%vgpr0' } + +name: entry_callee_wait +body: | + bb.0: + %vgpr0 = V_ADD_F32_e32 %vgpr0, %vgpr0, implicit %exec + S_SETPC_B64 killed %sgpr0_sgpr1 + +... 
diff --git a/test/CodeGen/AMDGPU/insert-waits-exp.mir b/test/CodeGen/AMDGPU/insert-waits-exp.mir index 9aaa374ed28e..1055201ce3dd 100644 --- a/test/CodeGen/AMDGPU/insert-waits-exp.mir +++ b/test/CodeGen/AMDGPU/insert-waits-exp.mir @@ -1,18 +1,18 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s --- | - define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { + define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x + i32> inreg, i32 inreg %w, float %v) #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %c = load volatile float, float addrspace(1)* undef %d = load volatile float, float addrspace(1)* undef - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %a, float %b, float %c, float %d) + call void @llvm.amdgcn.exp.f32(i32 15, i32 1, float %a, float %b, float %c, float %d, i1 true, i1 false) ret <4 x float> } - declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 - attributes #0 = { readnone } - attributes #1 = { nounwind } + attributes #0 = { nounwind } ... --- @@ -58,6 +58,6 @@ body: | %vgpr1 = V_MOV_B32_e32 1065353216, implicit %exec %vgpr2 = V_MOV_B32_e32 1073741824, implicit %exec %vgpr3 = V_MOV_B32_e32 1082130432, implicit %exec - SI_RETURN killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3 + SI_RETURN_TO_EPILOG killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3 ... diff --git a/test/CodeGen/AMDGPU/insert_subreg.ll b/test/CodeGen/AMDGPU/insert_subreg.ll index 4a5e8869c2df..e895f27c886d 100644 --- a/test/CodeGen/AMDGPU/insert_subreg.ll +++ b/test/CodeGen/AMDGPU/insert_subreg.ll @@ -6,7 +6,7 @@ ; Make sure this doesn't crash ; CHECK-LABEL: test: -define void @test(i64 addrspace(1)* %out) { +define amdgpu_kernel void @test(i64 addrspace(1)* %out) { entry: %tmp0 = alloca [16 x i32] %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 65ac693a4f44..6391b6b5407b 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-NO-TONGA %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-TONGA %s ; FIXME: Broken on evergreen ; FIXME: For some reason the 8 and 16 vectors are being stored as @@ -18,56 +18,56 @@ ; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000 ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]: -define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 store 
<4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_1: -define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_2: -define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_3: -define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4i32_0: -define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { +define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { %vecins = insertelement <4 x i32> %a, i32 999, i32 0 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_1: -define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_2: -define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_3: -define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void @@ -78,7 +78,7 @@ define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> % define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind { %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 - %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 false, i1 false, i1 false, i1 false, i1 true) ret <4 x float> %tmp2 } @@ -86,7 +86,7 @@ define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind { ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 ; GCN: 
v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 ret void @@ -97,7 +97,7 @@ define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x fl ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: ; GCN-DAG: buffer_store_dword v -define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void @@ -107,7 +107,7 @@ define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x fl ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void @@ -117,7 +117,7 @@ define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x fl ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 ret void @@ -129,7 +129,7 @@ define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x fl ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 ret void @@ -138,7 +138,7 @@ define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32: ; GCN: v_movreld_b32 ; GCN: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { %vecins = insertelement <2 x i32> %a, i32 5, i32 %b store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -148,7 +148,7 @@ define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x 
i32> ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: ; GCN-DAG: buffer_store_dword v -define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 ret void @@ -159,7 +159,7 @@ define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]] ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -169,7 +169,7 @@ define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ; GCN: v_movreld_b32 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { %vecins = insertelement <8 x i32> %a, i32 5, i32 %b store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 ret void @@ -181,21 +181,21 @@ define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { %vecins = insertelement <16 x i32> %a, i32 5, i32 %b store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v2i16: -define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { %vecins = insertelement <2 x i16> %a, i16 5, i32 %b store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i16: -define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { %vecins = insertelement <3 x i16> %a, i16 5, i32 %b store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8 ret void @@ -207,25 +207,22 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> ; GCN: buffer_load_ushort v{{[0-9]+}}, off ; GCN: buffer_load_ushort v{{[0-9]+}}, off -; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 8{{$}} ; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}} ; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}} -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, 
s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2 -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:14 +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:10 +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8 ; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: s_waitcnt -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; GCN: buffer_load_dwordx2 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off -define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { %vecins = insertelement <4 x i16> %a, i16 5, i32 %b store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8 ret void @@ -235,16 +232,17 @@ define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ; GCN: buffer_load_ubyte v{{[0-9]+}}, off ; GCN: buffer_load_ubyte v{{[0-9]+}}, off -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-TONGA: buffer_load_ushort ; GCN: buffer_store_short v{{[0-9]+}}, off -define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 ret void @@ -255,19 +253,19 @@ define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a ; GCN: buffer_load_ubyte v{{[0-9]+}}, off ; GCN: buffer_load_ubyte v{{[0-9]+}}, off -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 -; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} - -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-TONGA: buffer_load_ushort +; GCN-TONGA: buffer_load_ubyte ; GCN-DAG: 
buffer_store_byte v{{[0-9]+}}, off ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off -define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { %vecins = insertelement <3 x i8> %a, i8 5, i32 %b store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -279,34 +277,35 @@ define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a ; GCN: buffer_load_ubyte v{{[0-9]+}}, off ; GCN: buffer_load_ubyte v{{[0-9]+}}, off -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:3 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:7 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-NO-TONGA: buffer_load_ubyte +; GCN-TONGA: buffer_load_dword ; GCN: buffer_store_dword v{{[0-9]+}}, off -define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v8i8: -define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { %vecins = insertelement <8 x i8> %a, i8 5, i32 %b store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8: -define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { %vecins = insertelement <16 x i8> %a, i8 5, i32 %b store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 ret void @@ -315,7 +314,7 @@ define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that ; the compiler doesn't crash. 
; GCN-LABEL: {{^}}insert_split_bb: -define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { +define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { entry: %0 = insertelement <2 x i32> undef, i32 %a, i32 0 %1 = icmp eq i32 %a, 0 @@ -362,7 +361,7 @@ endif: ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { %vecins = insertelement <2 x double> %a, double 8.0, i32 %b store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 ret void @@ -375,14 +374,14 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { %vecins = insertelement <2 x i64> %a, i64 5, i32 %b store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64: -define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { %vecins = insertelement <3 x i64> %a, i64 5, i32 %b store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32 ret void @@ -396,15 +395,15 @@ define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> ; Stack store -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} ; Write element ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; Stack reload -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} ; Store result ; GCN: buffer_store_dwordx4 @@ -412,7 +411,7 @@ define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> ; GCN: s_endpgm ; GCN: ScratchSize: 64 -define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { %vecins = insertelement <4 x double> %a, double 8.0, i32 %b store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 ret void @@ -421,17 +420,17 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64: ; GCN-DAG: SCRATCH_RSRC_DWORD -; 
GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}} ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}} ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 @@ -439,10 +438,13 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm ; GCN: ScratchSize: 128 -define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 { %vecins = insertelement <8 x double> %a, double 8.0, i32 %b store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 ret void } -declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone +declare <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll new file mode 100644 index 000000000000..a3f82b8a0117 --- /dev/null +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -0,0 +1,470 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 
-mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s + +; GCN-LABEL: {{^}}s_insertelement_v2i16_0: +; GCN: s_load_dword [[VEC:s[0-9]+]] + +; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}} + +; GFX9-NOT: lshr +; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]] +define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg: +; GCN: s_load_dword [[ELT0:s[0-9]+]] +; GCN: s_load_dword [[VEC:s[0-9]+]] + +; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] + +; GFX9-NOT: [[ELT0]] +; GFX9-NOT: [[VEC]] +; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]] +define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg: +; GCN: s_load_dword [[ELT0:s[0-9]+]] +; GCN: s_load_dword [[VEC:s[0-9]+]] + +; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 +; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; CIVI-DAG: ; use [[SHR]] + +; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 +; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; GFX9-DAG: ; use [[ELT1]] +define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt1 = extractelement <2 x i16> %vec, i32 1 + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + %use1 = zext i16 %elt1 to i32 + call void asm sideeffect "; use $0", "s"(i32 %use1) #0 + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi: +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[VEC:s[0-9]+]] + +; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]] + +; GFX9-NOT: [[ELT0]] +; GFX9-NOT: [[VEC]] +; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]] +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt.hi = lshr i32 %elt.arg, 16 + %elt = trunc i32 %elt.hi to i16 + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1: +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[VEC:s[0-9]+]], + +; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] + +; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16 +; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]] +; GFX9: ; use [[ELT1]] 
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt.hi = lshr i32 %elt.arg, 16 + %elt = trunc i32 %elt.hi to i16 + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + %use1 = zext i16 %elt to i32 + call void asm sideeffect "; use $0", "s"(i32 %use1) #0 + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1: +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[VEC:s[0-9]+]], + +; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16 +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] + +; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16 +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] +; GFX9: ; use [[ELT_HI]] +; GFX9: ; use [[VEC_HI]] +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt.hi = lshr i32 %elt.arg, 16 + %elt = trunc i32 %elt.hi to i16 + %vec.hi = extractelement <2 x i16> %vec, i32 1 + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + %use1 = zext i16 %elt to i32 + %vec.hi.use1 = zext i16 %vec.hi to i32 + + call void asm sideeffect "; use $0", "s"(i32 %use1) #0 + call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0 + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_1: +; GCN: s_load_dword [[VEC:s[0-9]+]] + +; GCN-NOT: s_lshr + +; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} +; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000 + +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7 +define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg: +; GCN: s_load_dword [[ELT1:s[0-9]+]] +; GCN: s_load_dword [[VEC:s[0-9]+]] + +; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] + +; GCN-NOT: shlr +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]] +define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2f16_0: +; GCN: s_load_dword [[VEC:s[0-9]+]] +; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC:s[0-9]+]], 0xffff0000 +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x4500 + +; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]] +define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { + %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr + %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 + store <2 x half> 
%vecins, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2f16_1: +; GFX9: s_load_dword [[VEC:s[0-9]+]] +; GCN-NOT: s_lshr + +; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} +; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000 + +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500 +define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { + %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr + %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 + store <2 x half> %vecins, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2i16_0: +; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] +; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e7, [[ELT1]] + +; GFX9-DAG: s_movk_i32 [[ELT0:s[0-9]+]], 0x3e7{{$}} +; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2i16_0_reghi: +; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: s_load_dword [[ELT0:s[0-9]+]] + +; CIVI-DAG: s_lshr_b32 [[ELT0_SHIFT:s[0-9]+]], [[ELT0]], 16 +; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]] + +; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]] +; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]] + +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %elt.hi = lshr i32 %elt.arg, 16 + %elt = trunc i32 %elt.hi to i16 + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2i16_0_inlineimm: +; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] + +; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]] + +; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]] + +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x 
i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 53, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0 + +; GCN-LABEL: {{^}}v_insertelement_v2i16_1: +; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]] + +; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 +; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] + +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2i16_1_inlineimm: +; GCN: flat_load_dword [[VEC:v[0-9]+]] +; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2f16_0: +; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] + +; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x4500, [[ELT1]] + +; GFX9-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0x4500{{$}} +; GFX9-DAG: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]] + +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep + %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 + store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2f16_0_inlineimm: +; GCN: flat_load_dword [[VEC:v[0-9]+]] + +; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]] + +; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]] +; GFX9: v_lshl_or_b32 
[[RES:v[0-9]+]], [[ELT1]], 16, 53 +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep + %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0 + store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2f16_1: +; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]] + +; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500 +; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] + +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep + %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 + store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2f16_1_inlineimm: +; GCN: flat_load_dword [[VEC:v[0-9]+]] +; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] +define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep + %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1 + store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep + ret void +} + +; FIXME: Enable for others when argument load not split +; GCN-LABEL: {{^}}s_insertelement_v2i16_dynamic: +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 +; GCN: s_load_dword [[IDX:s[0-9]+]] +; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]] +; GCN-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16 +; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 { + %idx = load volatile i32, i32 addrspace(2)* %idx.ptr + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_sgpr: +; 
GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 +; GCN-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16 +; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: +; GCN: flat_load_dword [[IDX:v[0-9]+]] +; GCN: flat_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 + +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: +; GCN: flat_load_dword [[IDX:v[0-9]+]] +; GCN: flat_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 + +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load 
<2 x half>, <2 x half> addrspace(1)* %in.gep + %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx + store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir index 85cd903a405d..1479303712d0 100644 --- a/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -1,14 +1,46 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI # RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9 --- | - define void @div_fmas() { ret void } - define void @s_getreg() { ret void } - define void @s_setreg() { ret void } - define void @vmem_gt_8dw_store() { ret void } - define void @readwrite_lane() { ret void } - define void @rfe() { ret void } + define amdgpu_kernel void @div_fmas() { ret void } + define amdgpu_kernel void @s_getreg() { ret void } + define amdgpu_kernel void @s_setreg() { ret void } + define amdgpu_kernel void @vmem_gt_8dw_store() { ret void } + define amdgpu_kernel void @readwrite_lane() { ret void } + define amdgpu_kernel void @rfe() { ret void } + define amdgpu_kernel void @s_mov_fed_b32() { ret void } + define amdgpu_kernel void @s_movrel() { ret void } + define amdgpu_kernel void @v_interp() { ret void } + + define amdgpu_kernel void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) { + entry: + %A.addr = alloca i32 addrspace(1)*, align 4 + store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !5, metadata !11), !dbg !12 + ret void + } + + declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!3, !4} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) + !1 = !DIFile(filename: "test01.cl", directory: "/dev/null") + !2 = !{} + !3 = !{i32 2, !"Dwarf Version", i32 2} + !4 = !{i32 2, !"Debug Info Version", i32 3} + !5 = !DILocalVariable(name: "A", arg: 1, scope: !6, file: !1, line: 1, type: !9) + !6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) + !7 = !DISubroutineType(types: !8) + !8 = !{null, !9} + !9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 32) + !10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !11 = !DIExpression() + !12 = !DILocation(line: 1, column: 30, scope: !6) + ... --- # GCN-LABEL: name: div_fmas @@ -331,3 +363,185 @@ body: | S_ENDPGM ... + +... 
+---
+
+# GCN-LABEL: name: s_mov_fed_b32
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_FED_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOV_B32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_FED_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_MOV_B32
+name: s_mov_fed_b32
+
+body: |
+  bb.0:
+    successors: %bb.1
+    %sgpr0 = S_MOV_FED_B32 %sgpr0
+    %sgpr0 = S_MOV_B32 %sgpr0
+    S_BRANCH %bb.1
+
+  bb.1:
+    %sgpr0 = S_MOV_FED_B32 %sgpr0
+    %vgpr0 = V_MOV_B32_e32 %sgpr0, implicit %exec
+    S_ENDPGM
+
+...
+
+...
+---
+
+# GCN-LABEL: name: s_movrel
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELS_B32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELS_B64
+
+# GCN-LABEL: bb.2:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELD_B32
+
+# GCN-LABEL: bb.3:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELD_B64
+
+name: s_movrel
+
+body: |
+  bb.0:
+    successors: %bb.1
+    %m0 = S_MOV_B32 0
+    %sgpr0 = S_MOVRELS_B32 %sgpr0, implicit %m0
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    %m0 = S_MOV_B32 0
+    %sgpr0_sgpr1 = S_MOVRELS_B64 %sgpr0_sgpr1, implicit %m0
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    %m0 = S_MOV_B32 0
+    %sgpr0 = S_MOVRELD_B32 %sgpr0, implicit %m0
+    S_BRANCH %bb.3
+
+  bb.3:
+    %m0 = S_MOV_B32 0
+    %sgpr0_sgpr1 = S_MOVRELD_B64 %sgpr0_sgpr1, implicit %m0
+    S_ENDPGM
+...
+
+...
+---
+
+# GCN-LABEL: name: v_interp
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P1_F32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P2_F32
+
+# GCN-LABEL: bb.2:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P1_F32_16bank
+
+# GCN-LABEL: bb.3:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_MOV_F32
+
+name: v_interp
+
+body: |
+  bb.0:
+    successors: %bb.1
+    %m0 = S_MOV_B32 0
+    %vgpr0 = V_INTERP_P1_F32 %vgpr0, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    %m0 = S_MOV_B32 0
+    %vgpr0 = V_INTERP_P2_F32 %vgpr0, %vgpr1, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    %m0 = S_MOV_B32 0
+    %vgpr0 = V_INTERP_P1_F32_16bank %vgpr0, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    %m0 = S_MOV_B32 0
+    %vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit %m0, implicit %exec
+    S_ENDPGM
+...
+---
+name: mov_fed_hazard_crash_on_dbg_value
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr4_sgpr5' }
+  - { reg: '%sgpr6_sgpr7' }
+  - { reg: '%sgpr9' }
+  - { reg: '%sgpr0_sgpr1_sgpr2_sgpr3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 16
+  offsetAdjustment: 0
+  maxAlignment: 8
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+stack:
+  - { id: 0, name: A.addr, offset: 0, size: 8, alignment: 8, local-offset: 0 }
+  - { id: 1, offset: 8, size: 4, alignment: 4 }
+body: |
+  bb.0.entry:
+    liveins: %sgpr4_sgpr5, %sgpr6_sgpr7, %sgpr9, %sgpr0_sgpr1_sgpr2_sgpr3
+
+    %flat_scr_lo = S_ADD_U32 %sgpr6, %sgpr9, implicit-def %scc
+    %flat_scr_hi = S_ADDC_U32 %sgpr7, 0, implicit-def %scc, implicit %scc
+    DBG_VALUE _, 2, !5, !11, debug-location !12
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    dead %sgpr6_sgpr7 = KILL %sgpr4_sgpr5
+    %sgpr8 = S_MOV_B32 %sgpr5
+    %vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr + 4)
+    %sgpr8 = S_MOV_B32 %sgpr4, implicit killed %sgpr4_sgpr5
+    %vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr)
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/internalize.ll b/test/CodeGen/AMDGPU/internalize.ll
new file mode 100644
index 000000000000..968b1d326a76
--- /dev/null
+++ b/test/CodeGen/AMDGPU/internalize.ll
@@ -0,0 +1,35 @@
+; RUN: opt -O1 -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-internalize-symbols < %s | FileCheck %s
+; CHECK-NOT: unused
+; CHECK-NOT: foo_used
+; CHECK: gvar_used
+; CHECK: main_kernel
+
+@gvar_unused = addrspace(1) global i32 undef, align 4
+@gvar_used = addrspace(1) global i32 undef, align 4
+
+; Function Attrs: alwaysinline nounwind
+define amdgpu_kernel void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
+entry:
+  store i32 1, i32 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define amdgpu_kernel void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
+entry:
+  store i32 %tid, i32 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @main_kernel() {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  tail call void @foo_used(i32 addrspace(1)* @gvar_used, i32 %tid) nounwind
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+
+attributes #1 = { alwaysinline nounwind }
diff --git a/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index c29434f5eca2..31f2fbc919aa 100644
--- a/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s

 ; ERROR: error: :0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
-define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
+define amdgpu_kernel void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %stof ret void diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll deleted file mode 100644 index 49c314fbc5d0..000000000000 --- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s -; check llc does not crash for invalid opencl version metadata - -; CHECK: { amd.MDVersion: [ 2, 0 ] } - -!opencl.ocl.version = !{} diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll deleted file mode 100644 index 1f5e8be531dc..000000000000 --- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s -; check llc does not crash for invalid opencl version metadata - -; CHECK: { amd.MDVersion: [ 2, 0 ] } - -!opencl.ocl.version = !{!0} -!0 = !{} diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll deleted file mode 100644 index b77551e268a0..000000000000 --- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s -; check llc does not crash for invalid opencl version metadata - -; CHECK: { amd.MDVersion: [ 2, 0 ] } - -!opencl.ocl.version = !{!0} -!0 = !{i32 1} diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll index 45a061067cfc..5cd965d2fa9c 100644 --- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -10,7 +10,7 @@ ; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]], ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b ; GCN: buffer_store_dword [[K]], [[PTR]] -define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { +define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 store i16 123, i16 addrspace(1)* %ptr, align 4 @@ -22,7 +22,7 @@ define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 add ; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}} ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b ; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]: -define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 { +define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 store i16 123, i16 addrspace(1)* %ptr, align 4 diff --git a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir 
index 66182d092895..bc1dafe0ea1e 100644 --- a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir +++ b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir @@ -1,7 +1,7 @@ # RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s --- | - define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { entry: br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 diff --git a/test/CodeGen/AMDGPU/kcache-fold.ll b/test/CodeGen/AMDGPU/kcache-fold.ll index 43448fbd7b33..37dd977ae216 100644 --- a/test/CodeGen/AMDGPU/kcache-fold.ll +++ b/test/CodeGen/AMDGPU/kcache-fold.ll @@ -1,100 +1,112 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s ; CHECK: {{^}}main1: ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} -define void @main1() { +define amdgpu_kernel void @main1() #0 { main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = extractelement <4 x float> %0, i32 0 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %3 = extractelement <4 x float> %2, i32 0 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %5 = extractelement <4 x float> %4, i32 0 - %6 = fcmp ogt float %1, 0.000000e+00 - %7 = select i1 %6, float %3, float %5 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 1 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %11 = extractelement <4 x float> %10, i32 1 - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %13 = extractelement <4 x float> %12, i32 1 - %14 = fcmp ogt float %9, 0.000000e+00 - %15 = select i1 %14, float %11, float %13 - %16 = load <4 x float>, <4 x float> addrspace(8)* null - %17 = extractelement <4 x float> %16, i32 2 - %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %19 = extractelement <4 x float> %18, i32 2 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %21 = extractelement <4 x float> %20, i32 2 - %22 = fcmp ogt float %17, 0.000000e+00 - %23 = select i1 %22, float %19, float %21 - %24 = load <4 x float>, <4 x float> addrspace(8)* null - %25 = extractelement <4 x float> %24, i32 3 - %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %27 = extractelement <4 x float> %26, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %29 = extractelement <4 x float> %28, i32 3 - %30 = fcmp ogt float %25, 0.000000e+00 - %31 = select i1 %30, float %27, float %29 - %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00) - %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00) - %35 = call float 
@llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00) - %36 = insertelement <4 x float> undef, float %32, i32 0 - %37 = insertelement <4 x float> %36, float %33, i32 1 - %38 = insertelement <4 x float> %37, float %34, i32 2 - %39 = insertelement <4 x float> %38, float %35, i32 3 - call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0) + %tmp = load <4 x float>, <4 x float> addrspace(8)* null + %tmp7 = extractelement <4 x float> %tmp, i32 0 + %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp9 = extractelement <4 x float> %tmp8, i32 0 + %tmp10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp11 = extractelement <4 x float> %tmp10, i32 0 + %tmp12 = fcmp ogt float %tmp7, 0.000000e+00 + %tmp13 = select i1 %tmp12, float %tmp9, float %tmp11 + %tmp14 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp15 = extractelement <4 x float> %tmp14, i32 1 + %tmp16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp17 = extractelement <4 x float> %tmp16, i32 1 + %tmp18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp19 = extractelement <4 x float> %tmp18, i32 1 + %tmp20 = fcmp ogt float %tmp15, 0.000000e+00 + %tmp21 = select i1 %tmp20, float %tmp17, float %tmp19 + %tmp22 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp23 = extractelement <4 x float> %tmp22, i32 2 + %tmp24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp25 = extractelement <4 x float> %tmp24, i32 2 + %tmp26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp27 = extractelement <4 x float> %tmp26, i32 2 + %tmp28 = fcmp ogt float %tmp23, 0.000000e+00 + %tmp29 = select i1 %tmp28, float %tmp25, float %tmp27 + %tmp30 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp31 = extractelement <4 x float> %tmp30, i32 3 + %tmp32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp33 = extractelement <4 x float> %tmp32, i32 3 + %tmp34 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp35 = extractelement <4 x float> %tmp34, i32 3 + %tmp36 = fcmp ogt float %tmp31, 0.000000e+00 + %tmp37 = select i1 %tmp36, float %tmp33, float %tmp35 + %max.0.i = call float @llvm.maxnum.f32(float %tmp13, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) + %max.0.i5 = call float @llvm.maxnum.f32(float %tmp21, float 0.000000e+00) + %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00) + %max.0.i3 = call float @llvm.maxnum.f32(float %tmp29, float 0.000000e+00) + %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00) + %max.0.i1 = call float @llvm.maxnum.f32(float %tmp37, float 0.000000e+00) + %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) + %tmp38 = insertelement <4 x float> undef, float %clamp.i, i32 0 + %tmp39 = insertelement <4 x float> %tmp38, float %clamp.i6, 
i32 1 + %tmp40 = insertelement <4 x float> %tmp39, float %clamp.i4, i32 2 + %tmp41 = insertelement <4 x float> %tmp40, float %clamp.i2, i32 3 + call void @llvm.r600.store.swizzle(<4 x float> %tmp41, i32 0, i32 0) ret void } ; CHECK: {{^}}main2: ; CHECK-NOT: MOV -define void @main2() { +define amdgpu_kernel void @main2() #0 { main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = extractelement <4 x float> %0, i32 0 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %3 = extractelement <4 x float> %2, i32 0 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %5 = extractelement <4 x float> %4, i32 1 - %6 = fcmp ogt float %1, 0.000000e+00 - %7 = select i1 %6, float %3, float %5 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 1 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %11 = extractelement <4 x float> %10, i32 0 - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %13 = extractelement <4 x float> %12, i32 1 - %14 = fcmp ogt float %9, 0.000000e+00 - %15 = select i1 %14, float %11, float %13 - %16 = load <4 x float>, <4 x float> addrspace(8)* null - %17 = extractelement <4 x float> %16, i32 2 - %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %19 = extractelement <4 x float> %18, i32 3 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %21 = extractelement <4 x float> %20, i32 2 - %22 = fcmp ogt float %17, 0.000000e+00 - %23 = select i1 %22, float %19, float %21 - %24 = load <4 x float>, <4 x float> addrspace(8)* null - %25 = extractelement <4 x float> %24, i32 3 - %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %27 = extractelement <4 x float> %26, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %29 = extractelement <4 x float> %28, i32 2 - %30 = fcmp ogt float %25, 0.000000e+00 - %31 = select i1 %30, float %27, float %29 - %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00) - %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00) - %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00) - %36 = insertelement <4 x float> undef, float %32, i32 0 - %37 = insertelement <4 x float> %36, float %33, i32 1 - %38 = insertelement <4 x float> %37, float %34, i32 2 - %39 = insertelement <4 x float> %38, float %35, i32 3 - call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0) + %tmp = load <4 x float>, <4 x float> addrspace(8)* null + %tmp7 = extractelement <4 x float> %tmp, i32 0 + %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp9 = extractelement <4 x float> %tmp8, 
i32 0 + %tmp10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp11 = extractelement <4 x float> %tmp10, i32 1 + %tmp12 = fcmp ogt float %tmp7, 0.000000e+00 + %tmp13 = select i1 %tmp12, float %tmp9, float %tmp11 + %tmp14 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp15 = extractelement <4 x float> %tmp14, i32 1 + %tmp16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp17 = extractelement <4 x float> %tmp16, i32 0 + %tmp18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp19 = extractelement <4 x float> %tmp18, i32 1 + %tmp20 = fcmp ogt float %tmp15, 0.000000e+00 + %tmp21 = select i1 %tmp20, float %tmp17, float %tmp19 + %tmp22 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp23 = extractelement <4 x float> %tmp22, i32 2 + %tmp24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp25 = extractelement <4 x float> %tmp24, i32 3 + %tmp26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp27 = extractelement <4 x float> %tmp26, i32 2 + %tmp28 = fcmp ogt float %tmp23, 0.000000e+00 + %tmp29 = select i1 %tmp28, float %tmp25, float %tmp27 + %tmp30 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp31 = extractelement <4 x float> %tmp30, i32 3 + %tmp32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp33 = extractelement <4 x float> %tmp32, i32 3 + %tmp34 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp35 = extractelement <4 x float> %tmp34, i32 2 + %tmp36 = fcmp ogt float %tmp31, 0.000000e+00 + %tmp37 = select i1 %tmp36, float %tmp33, float %tmp35 + %max.0.i = call float @llvm.maxnum.f32(float %tmp13, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) + %max.0.i5 = call float @llvm.maxnum.f32(float %tmp21, float 0.000000e+00) + %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00) + %max.0.i3 = call float @llvm.maxnum.f32(float %tmp29, float 0.000000e+00) + %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00) + %max.0.i1 = call float @llvm.maxnum.f32(float %tmp37, float 0.000000e+00) + %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) + %tmp38 = insertelement <4 x float> undef, float %clamp.i, i32 0 + %tmp39 = insertelement <4 x float> %tmp38, float %clamp.i6, i32 1 + %tmp40 = insertelement <4 x float> %tmp39, float %clamp.i4, i32 2 + %tmp41 = insertelement <4 x float> %tmp40, float %clamp.i2, i32 3 + call void @llvm.r600.store.swizzle(<4 x float> %tmp41, i32 0, i32 0) ret void } -declare float @llvm.AMDGPU.clamp.f32(float, float, float) readnone -declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) +declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll 
b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll index 21c92dbc9098..8e358ef2804f 100644 --- a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll +++ b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll @@ -4,40 +4,40 @@ ; alignment of the stack ; CHECK-LABEL: {{^}}no_args: -; CHECK: ScratchSize: 8{{$}} -define void @no_args() { +; CHECK: ScratchSize: 5{{$}} +define amdgpu_kernel void @no_args() { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void } ; CHECK-LABEL: {{^}}force_align32: -; CHECK: ScratchSize: 8{{$}} -define void @force_align32(<8 x i32>) { +; CHECK: ScratchSize: 5{{$}} +define amdgpu_kernel void @force_align32(<8 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void } ; CHECK-LABEL: {{^}}force_align64: -; CHECK: ScratchSize: 8{{$}} -define void @force_align64(<16 x i32>) { +; CHECK: ScratchSize: 5{{$}} +define amdgpu_kernel void @force_align64(<16 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void } ; CHECK-LABEL: {{^}}force_align128: -; CHECK: ScratchSize: 8{{$}} -define void @force_align128(<32 x i32>) { +; CHECK: ScratchSize: 5{{$}} +define amdgpu_kernel void @force_align128(<32 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void } ; CHECK-LABEL: {{^}}force_align256: -; CHECK: ScratchSize: 8{{$}} -define void @force_align256(<64 x i32>) { +; CHECK: ScratchSize: 5{{$}} +define amdgpu_kernel void @force_align256(<64 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 95a68319f8af..6fa26cb38793 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -17,7 +17,7 @@ ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { +define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { entry: %0 = zext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -36,7 +36,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { +define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { entry: %0 = zext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -55,7 +55,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { +define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { entry: %0 = sext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -75,7 +75,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { +define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { entry: %0 = zext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -94,7 +94,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { +define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) 
nounwind { entry: %0 = zext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -113,7 +113,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { +define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { entry: %0 = sext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -126,7 +126,7 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8 -define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { +define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { entry: store i32 %in, i32 addrspace(1)* %out, align 4 ret void @@ -138,7 +138,7 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 -define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { +define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { entry: store float %in, float addrspace(1)* %out, align 4 ret void @@ -152,7 +152,7 @@ entry: ; MESA-GCN: buffer_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { +define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { entry: store <2 x i8> %in, <2 x i8> addrspace(1)* %out ret void @@ -166,7 +166,7 @@ entry: ; MESA-GCN: buffer_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { +define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(1)* %out ret void @@ -179,7 +179,7 @@ entry: ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { +define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { entry: store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 ret void @@ -192,7 +192,7 @@ entry: ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8 -define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { +define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { entry: store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 ret void @@ -209,7 +209,7 @@ entry: ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { +define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -226,7 +226,7 @@ entry: ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { +define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 
x i16> %in) nounwind { entry: store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 ret void @@ -239,7 +239,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { +define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { entry: store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 ret void @@ -253,7 +253,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { +define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { entry: store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 ret void @@ -273,7 +273,7 @@ entry: ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { +define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(1)* %out ret void @@ -293,7 +293,7 @@ entry: ; HSA-GCN: flat_load_ushort ; HSA-GCN: flat_load_ushort ; HSA-GCN: flat_load_ushort -define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { +define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out ret void @@ -308,7 +308,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { +define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -323,7 +323,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { +define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { entry: store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 ret void @@ -354,7 +354,7 @@ entry: ; HSA-GCN: float_load_ubyte ; HSA-GCN: float_load_ubyte ; HSA-GCN: float_load_ubyte -define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { +define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { entry: store <8 x i8> %in, <8 x i8> addrspace(1)* %out ret void @@ -386,7 +386,7 @@ entry: ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { +define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out ret void @@ -405,7 +405,7 @@ entry: ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 -define void 
@v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { +define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 ret void @@ -422,7 +422,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 -define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { +define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 ret void @@ -478,7 +478,7 @@ entry: ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { +define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { entry: store <16 x i8> %in, <16 x i8> addrspace(1)* %out ret void @@ -534,7 +534,7 @@ entry: ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { +define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out ret void @@ -561,7 +561,7 @@ entry: ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { +define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 ret void @@ -588,7 +588,7 @@ entry: ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { +define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 ret void @@ -599,7 +599,7 @@ entry: ; MESA-GCN: s_load_dwordx2 ; MESA-GCN: buffer_store_dwordx2 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } @@ -611,7 +611,7 @@ define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { ; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c ; MESA-GCN: buffer_store_dwordx2 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { entry: store double %in, double addrspace(1)* %out ret void @@ -621,7 +621,7 @@ entry: ; XGCN: s_load_dwordx2 ; XGCN: s_load_dwordx2 ; XGCN: buffer_store_dwordx2 -; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { ; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 ; ret void ; } @@ -631,7 
+631,7 @@ entry:
 ; SI: v_and_b32_e32
 ; SI: buffer_store_byte
 ; SI: s_endpgm
-define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
   store i1 %x, i1 addrspace(1)* %out, align 1
   ret void
 }
@@ -640,7 +640,7 @@ define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -650,7 +650,7 @@ define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
@@ -660,7 +660,7 @@ define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -672,7 +672,7 @@ define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
 ; SI: v_ashrrev_i32
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 4f6dbf9dc2bf..4af37d8da966 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s

@@ -14,6 +15,7 @@
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
 ; CI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe8f000
 ; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000
+; GFX9-DAG: s_mov_b32 s{{[0-9]+}}, 0xe00000

 ; GCNHSA: .amd_kernel_code_t

@@ -46,7 +48,7 @@

 ; Scratch size = alloca size + emergency stack slot
 ; ALL: ; ScratchSize: 32772
-define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
   %large = alloca [8192 x i32], align 4
   %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
   store volatile i32 %x, i32* %gep
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index
ea9754a390b6..28b819a6374b 100644 --- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s ; ALL-LABEL: {{^}}large_alloca_pixel_shader: ; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 @@ -7,6 +8,7 @@ ; GCN-DAG: s_mov_b32 s10, -1 ; CI-DAG: s_mov_b32 s11, 0xe8f000 ; VI-DAG: s_mov_b32 s11, 0xe80000 +; GFX9-DAG: s_mov_b32 s11, 0xe00000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen @@ -28,6 +30,7 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { ; GCN-DAG: s_mov_b32 s10, -1 ; CI-DAG: s_mov_b32 s11, 0xe8f000 ; VI-DAG: s_mov_b32 s11, 0xe80000 +; GFX9-DAG: s_mov_b32 s11, 0xe00000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen diff --git a/test/CodeGen/AMDGPU/large-constant-initializer.ll b/test/CodeGen/AMDGPU/large-constant-initializer.ll index 9975b1b7f5cc..c46d68e38ade 100644 --- a/test/CodeGen/AMDGPU/large-constant-initializer.ll +++ b/test/CodeGen/AMDGPU/large-constant-initializer.ll @@ -4,7 +4,7 @@ @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 -define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { +define amdgpu_kernel void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 %mul12 = mul nsw i32 %val, 7 br i1 undef, label %exit, label %bb diff --git a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll index 906a688febd2..13dd7058c50a 100644 --- a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll +++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll @@ -1,8 +1,10 @@ -; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s +; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s -; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4 +; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4 +; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -22,9 +24,9 @@ entry: ret void } -; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4 +; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global 
[256 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { +define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -44,9 +46,9 @@ entry: ret void } -; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4 +; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { +define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -66,9 +68,10 @@ entry: ret void } -; CHECK-LABEL: @occupancy_0( -; CHECK: alloca [5 x i32] -define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { +; ALL-LABEL: @occupancy_0( +; CI-NOT: alloca [5 x i32] +; SI: alloca [5 x i32] +define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -88,9 +91,10 @@ entry: ret void } -; CHECK-LABEL: @occupancy_max( -; CHECK: alloca [5 x i32] -define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { +; ALL-LABEL: @occupancy_max( +; CI-NOT: alloca [5 x i32] +; SI: alloca [5 x i32] +define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -110,9 +114,11 @@ entry: ret void } -; CHECK-LABEL: @occupancy_6( -; CHECK-NOT: alloca -define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { +; SI-LABEL: @occupancy_6( +; CI-LABEL: @occupancy_6( +; SI: alloca +; CI-NOT: alloca +define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [42 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -134,9 +140,9 @@ entry: ret void } -; CHECK-LABEL: @occupancy_6_over( -; CHECK: alloca [43 x i8] -define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { +; ALL-LABEL: @occupancy_6_over( +; ALL: alloca [43 x i8] +define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [43 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -158,9 +164,11 @@ entry: ret void } -; CHECK-LABEL: @occupancy_8( -; CHECK-NOT: alloca -define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { +; SI-LABEL: @occupancy_8( +; CI-LABEL: @occupancy_8( +; SI: alloca +; CI-NOT: alloca +define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [32 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -182,9 +190,9 @@ entry: ret void } -; CHECK-LABEL: @occupancy_8_over( -; CHECK: alloca [33 x i8] -define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { +; ALL-LABEL: @occupancy_8_over( +; ALL: 
alloca [33 x i8] +define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [33 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -206,9 +214,11 @@ entry: ret void } -; CHECK-LABEL: @occupancy_9( -; CHECK-NOT: alloca -define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { +; SI-LABEL: @occupancy_9( +; CI-LABEL: @occupancy_9( +; SI: alloca +; CI-NOT: alloca +define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { entry: %stack = alloca [28 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -230,9 +240,9 @@ entry: ret void } -; CHECK-LABEL: @occupancy_9_over( -; CHECK: alloca [29 x i8] -define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { +; ALL-LABEL: @occupancy_9_over( +; ALL: alloca [29 x i8] +define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { entry: %stack = alloca [29 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 diff --git a/test/CodeGen/AMDGPU/lds-alignment.ll b/test/CodeGen/AMDGPU/lds-alignment.ll index 99334585e589..c23dea2b6b76 100644 --- a/test/CodeGen/AMDGPU/lds-alignment.ll +++ b/test/CodeGen/AMDGPU/lds-alignment.ll @@ -15,7 +15,7 @@ declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace ; HSA-LABEL: {{^}}test_no_round_size_1: ; HSA: workgroup_group_segment_byte_size = 38 -define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) @@ -34,7 +34,7 @@ define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) # ; HSA-LABEL: {{^}}test_round_size_2: ; HSA: workgroup_group_segment_byte_size = 86 ; HSA: group_segment_alignment = 4 -define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) @@ -50,7 +50,7 @@ define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { ; HSA-LABEL: {{^}}test_round_size_2_align_8: ; HSA: workgroup_group_segment_byte_size = 86 ; HSA: group_segment_alignment = 4 -define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) @@ -65,7 +65,7 @@ 
define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* % ; HSA-LABEL: {{^}}test_round_local_lds_and_arg: ; HSA: workgroup_group_segment_byte_size = 38 ; HSA: group_segment_alignment = 4 -define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { +define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) @@ -78,7 +78,7 @@ define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1) ; HSA-LABEL: {{^}}test_round_lds_arg: ; HSA: workgroup_group_segment_byte_size = 0 ; HSA: group_segment_alignment = 4 -define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { +define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false) ret void @@ -88,7 +88,7 @@ define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 ; HSA-LABEL: {{^}}test_high_align_lds_arg: ; HSA: workgroup_group_segment_byte_size = 0 ; HSA: group_segment_alignment = 4 -define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { +define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false) ret void @@ -98,7 +98,7 @@ define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: ; HSA: workgroup_group_segment_byte_size = 212 ; HSA: group_segment_alignment = 4 -define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false) @@ -114,7 +114,7 @@ define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addr ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1: ; HSA: workgroup_group_segment_byte_size = 216 ; HSA: group_segment_alignment = 4 -define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 
false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false) @@ -142,7 +142,7 @@ define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addr ; HSA-LABEL: {{^}}test_round_size_3_order0: ; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) @@ -163,7 +163,7 @@ define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order1: ; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) @@ -184,7 +184,7 @@ define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order2: ; HSA: workgroup_group_segment_byte_size = 150 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) @@ -205,7 +205,7 @@ define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order3: ; HSA: workgroup_group_segment_byte_size = 118 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) @@ -226,7 +226,7 @@ define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order4: ; HSA: workgroup_group_segment_byte_size = 142 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 
%lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) @@ -247,7 +247,7 @@ define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order5: ; HSA: workgroup_group_segment_byte_size = 126 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll index 9875814b03d3..254673d8a1e4 100644 --- a/test/CodeGen/AMDGPU/lds-initializer.ll +++ b/test/CodeGen/AMDGPU/lds-initializer.ll @@ -5,7 +5,7 @@ @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] -define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { +define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 %ld = load i32, i32 addrspace(3)* %gep store i32 %ld, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll index 078d6330ce04..1b3eeed3005c 100644 --- a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll +++ b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll @@ -18,7 +18,7 @@ ; GCN: BB0_3: ; GCN-NEXT: s_endpgm -define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 { +define amdgpu_kernel void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge diff --git a/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/test/CodeGen/AMDGPU/lds-oqap-crash.ll index 6ff6fc3d7afc..fff2a9200729 100644 --- a/test/CodeGen/AMDGPU/lds-oqap-crash.ll +++ b/test/CodeGen/AMDGPU/lds-oqap-crash.ll @@ -10,7 +10,7 @@ ; reads and writes are bundled together in the same instruction. 
; CHECK: {{^}}lds_crash: -define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { entry: %0 = load i32, i32 addrspace(3)* %in ; This block needs to be > 115 ISA instructions to hit the bug, diff --git a/test/CodeGen/AMDGPU/lds-output-queue.ll b/test/CodeGen/AMDGPU/lds-output-queue.ll index abe472e423fc..8b7e9e6d6aa8 100644 --- a/test/CodeGen/AMDGPU/lds-output-queue.ll +++ b/test/CodeGen/AMDGPU/lds-output-queue.ll @@ -10,7 +10,7 @@ @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 -define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { +define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { entry: %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index %1 = load i32, i32 addrspace(3)* %0 @@ -88,7 +88,7 @@ declare void @llvm.r600.group.barrier() nounwind convergent ; CHECK: LDS_READ_RET ; CHECK-NOT: ALU clause ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP -define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 %1 = load i32, i32 addrspace(3)* %0 diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll index 1607713090e3..c65817abd489 100644 --- a/test/CodeGen/AMDGPU/lds-size.ll +++ b/test/CodeGen/AMDGPU/lds-size.ll @@ -14,7 +14,7 @@ ; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only) @lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 -define void @test(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %if, label %else diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll index cb5d73fb0d8b..53c1c727a19d 100644 --- a/test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -5,7 +5,7 @@ @lds = addrspace(3) global [256 x i32] zeroinitializer -define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { +define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 %ld = load i32, i32 addrspace(3)* %gep store i32 %ld, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll index 4244c48d240e..e85a1b690af6 100644 --- a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll +++ b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll @@ -11,7 +11,7 @@ ; CHECK: {{^}}setcc_expand: ; CHECK: SET ; CHECK-NOT: CND -define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp eq i32 %in, 5 br i1 %0, label %IF, label %ENDIF diff --git a/test/CodeGen/AMDGPU/limit-coalesce.mir b/test/CodeGen/AMDGPU/limit-coalesce.mir new file mode 100644 index 000000000000..106a96e32dc3 --- /dev/null +++ b/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -0,0 +1,71 @@ +# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s + +# Check that coalescer does not create wider register 
tuple than in source
+
+# CHECK: - { id: 2, class: vreg_64 }
+# CHECK: - { id: 3, class: vreg_64 }
+# CHECK: - { id: 4, class: vreg_64 }
+# CHECK: - { id: 5, class: vreg_96 }
+# CHECK: - { id: 6, class: vreg_96 }
+# CHECK: - { id: 7, class: vreg_128 }
+# CHECK: - { id: 8, class: vreg_128 }
+# No more registers shall be defined
+# CHECK-NEXT: liveins:
+# CHECK: FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %4,
+# CHECK: FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %6,
+
+---
+name: main
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 1, class: sreg_32_xm0, preferred-register: '%1' }
+  - { id: 2, class: vreg_64, preferred-register: '%2' }
+  - { id: 3, class: vreg_64 }
+  - { id: 4, class: vreg_64 }
+  - { id: 5, class: vreg_64 }
+  - { id: 6, class: vreg_96 }
+  - { id: 7, class: vreg_96 }
+  - { id: 8, class: vreg_128 }
+  - { id: 9, class: vreg_128 }
+liveins:
+  - { reg: '%sgpr6', virtual-reg: '%1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    liveins: %sgpr0, %vgpr0_vgpr1
+
+    %3 = IMPLICIT_DEF
+    undef %4.sub0 = COPY %sgpr0
+    %4.sub1 = COPY %3.sub0
+    undef %5.sub0 = COPY %4.sub1
+    %5.sub1 = COPY %4.sub0
+    FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    %6 = IMPLICIT_DEF
+    undef %7.sub0_sub1 = COPY %6
+    %7.sub2 = COPY %3.sub0
+    FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+    %8 = IMPLICIT_DEF
+    undef %9.sub0_sub1_sub2 = COPY %8
+    %9.sub3 = COPY %3.sub0
+    FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr
+...
diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll index 82fbb7f46186..1c546ba9f74b 100644 --- a/test/CodeGen/AMDGPU/literals.ll +++ b/test/CodeGen/AMDGPU/literals.ll @@ -10,7 +10,7 @@ ; CHECK: LSHR ; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 5 -define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @i32_literal(i32 addrspace(1)* %out, i32 %in) { entry: %0 = add i32 5, %in store i32 %0, i32 addrspace(1)* %out @@ -27,7 +27,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 1084227584(5.0 -define void @float_literal(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @float_literal(float addrspace(1)* %out, float %in) { entry: %0 = fadd float 5.0, %in store float %0, float addrspace(1)* %out @@ -41,7 +41,7 @@ entry: ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 -define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { entry: store <4 x i32> , <4 x i32> addrspace(1)* %out ret void @@ -52,7 +52,7 @@ entry: ; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 ; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 ; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 -define void @inline_literal_dot4(float addrspace(1)* %out) { +define amdgpu_kernel void @inline_literal_dot4(float addrspace(1)* %out) { entry: %0 = call float @llvm.r600.dot4(<4 x float> , <4 x float> ) store float %0, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/liveness.mir b/test/CodeGen/AMDGPU/liveness.mir index 112c3f8e69a6..48762e3f2ab4 100644 --- a/test/CodeGen/AMDGPU/liveness.mir +++ b/test/CodeGen/AMDGPU/liveness.mir @@ -8,7 +8,7 @@ # Should see three distinct value numbers: # CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}} --- | - define void @test0() { ret void } + define amdgpu_kernel void @test0() { ret void } ... 
--- name: test0 diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll deleted file mode 100644 index 77dd4b134982..000000000000 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll +++ /dev/null @@ -1,437 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac -define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_bfe_print_arg: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 -define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { - %load = load i32, i32 addrspace(1)* %src0, align 4 - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_6: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: s_endpgm -define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call 
i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_7: -; SI-NOT: shl -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_8: -; SI: buffer_load_dword -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_13: -; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = ashr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_14: -; SI-NOT: lshr -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = lshr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 
[[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_4: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) 
nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; 
SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], -; SI-NOT: v_lshl -; SI-NOT: v_ashr -; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 -; SI: buffer_store_dword [[BFE]], -define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) - %shl = shl i32 %bfe, 8 - %ashr = ashr i32 %shl, 8 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @simplify_demanded_bfe_sdiv -; SI: buffer_load_dword [[LOAD:v[0-9]+]] -; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 -; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] -; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] -; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] -; SI: buffer_store_dword [[TMP2]] -define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %src = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone - %div = sdiv i32 %bfe, 2 - store i32 %div, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll deleted file mode 100644 index ee47b14c496d..000000000000 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll +++ /dev/null @@ -1,631 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_imm_arg(i32 
addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zextload_i8: -; SI: buffer_load_ubyte -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %load = load i8, i8 addrspace(1)* %in - %ext = zext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: -; GCN: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 -; FIXME: Should be using s_add_i32 -; VI: v_add_i32 -; VI-NEXT: v_and_b32_e32 -; SI-NOT: {{[^@]}}bfe -; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 65535 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - 
%bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 65535 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_1: -; SI: buffer_load_dword -; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; SI: s_endpgm -; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, -define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_4: -; SI-NOT: lshl -; SI-NOT: shr -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = lshr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_5: -; SI: buffer_load_dword -; SI-NOT: lshl -; SI-NOT: shr -; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = ashr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_6: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: s_endpgm -define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; 
FUNC-LABEL: {{^}}bfe_u32_test_7: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_13: -; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = ashr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_14: -; SI-NOT: lshr -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = lshr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* 
%out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_5: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9: -; SI-NOT: 
{{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { - 
%bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure that SimplifyDemandedBits doesn't cause the and to be -; reduced to the bits demanded by the bfe. - -; XXX: The operand to v_bfe_u32 could also just directly be the load register. -; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: -; SI: buffer_load_dword [[ARG:v[0-9]+]] -; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] -; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 -; SI-DAG: buffer_store_dword [[AND]] -; SI-DAG: buffer_store_dword [[BFE]] -; SI: s_endpgm -define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, - i32 addrspace(1)* %out1, - i32 addrspace(1)* %in) nounwind { - %src = load i32, i32 addrspace(1)* %in, align 4 - %and = and i32 %src, 63 - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 - store i32 %and, i32 addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lshr_and: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = lshr i32 %a, 6 - %c = and i32 %b, 7 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_lshr_and: -; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 -; SI: buffer_store_dword -define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = lshr i32 %a, %b - %d = and i32 %c, 7 - store i32 %d, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}and_lshr: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = and i32 %a, 448 - %c = lshr i32 %b, 6 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}and_lshr2: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = and i32 %a, 511 - %c = lshr i32 %b, 6 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}shl_lshr: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 -; SI: buffer_store_dword -define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = shl i32 %a, 9 - %c = lshr i32 %b, 11 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll deleted file mode 100644 index 2336109f4dad..000000000000 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float 
@llvm.fabs.f32(float) nounwind readnone -declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone - -; FUNC-LABEL: {{^}}clamp_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0 clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm - -; EG: MOV_SAT -define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, 0 clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], 0 clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fneg = fsub float -0.0, %src - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, 0 clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone - %src.fneg.fabs = fsub float -0.0, %src.fabs - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll deleted file mode 100644 index 78b88122229b..000000000000 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s - -; CHECK-LABEL: {{^}}cube: -; CHECK: CUBE T{{[0-9]}}.X -; CHECK: CUBE T{{[0-9]}}.Y -; CHECK: CUBE T{{[0-9]}}.Z -; CHECK: CUBE * T{{[0-9]}}.W -define amdgpu_ps void @cube() { -main_body: - %tmp = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %tmp1 = extractelement <4 x float> %tmp, i32 3 - %tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %tmp3 = extractelement <4 x float> %tmp2, i32 0 - %tmp4 = fdiv float %tmp3, %tmp1 - %tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %tmp6 = extractelement <4 x float> %tmp5, i32 1 - %tmp7 = fdiv float %tmp6, %tmp1 - %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %tmp9 = extractelement <4 x float> %tmp8, i32 2 - %tmp10 = fdiv float %tmp9, %tmp1 - %tmp11 = insertelement <4 x 
float> undef, float %tmp4, i32 0 - %tmp12 = insertelement <4 x float> %tmp11, float %tmp7, i32 1 - %tmp13 = insertelement <4 x float> %tmp12, float %tmp10, i32 2 - %tmp14 = insertelement <4 x float> %tmp13, float 1.000000e+00, i32 3 - %tmp15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp14) - %tmp16 = extractelement <4 x float> %tmp15, i32 0 - %tmp17 = extractelement <4 x float> %tmp15, i32 1 - %tmp18 = extractelement <4 x float> %tmp15, i32 2 - %tmp19 = extractelement <4 x float> %tmp15, i32 3 - %tmp20 = call float @llvm.fabs.f32(float %tmp18) - %tmp21 = fdiv float 1.000000e+00, %tmp20 - %tmp22 = fmul float %tmp16, %tmp21 - %tmp23 = fadd float %tmp22, 1.500000e+00 - %tmp24 = fmul float %tmp17, %tmp21 - %tmp25 = fadd float %tmp24, 1.500000e+00 - %tmp26 = insertelement <4 x float> undef, float %tmp25, i32 0 - %tmp27 = insertelement <4 x float> %tmp26, float %tmp23, i32 1 - %tmp28 = insertelement <4 x float> %tmp27, float %tmp19, i32 2 - %tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 3 - %tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> - %tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1) - call void @llvm.r600.store.swizzle(<4 x float> %tmp31, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.fabs.f32(float) #0 - -declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) - -; Function Attrs: readnone -declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll index 59997d27683d..595f632b493d 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll @@ -4,15 +4,14 @@ ; SI-LABEL: {{^}}kill_gs_const: ; SI-NOT: v_cmpx_le_f32 ; SI: s_mov_b64 exec, 0 - define amdgpu_gs void @kill_gs_const() { main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %3) + %tmp = icmp ule i32 0, 3 + %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %tmp1) + %tmp2 = icmp ule i32 3, 0 + %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %tmp3) ret void } @@ -21,16 +20,16 @@ main_body: ; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { +define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, 
float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) { entry: - %tmp0 = fcmp olt float %13, 0.0 - call void @llvm.AMDGPU.kill(float %14) - %tmp1 = select i1 %tmp0, float 1.0, float 0.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) + %tmp0 = fcmp olt float %arg13, 0.000000e+00 + call void @llvm.AMDGPU.kill(float %arg14) + %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 ret void } -declare void @llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.AMDGPU.kill(float) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -!0 = !{!"const", null, i32 1} +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.export.ll b/test/CodeGen/AMDGPU/llvm.SI.export.ll deleted file mode 100644 index 23a32dcfd943..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.export.ll +++ /dev/null @@ -1,237 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #0 - -; GCN-LABEL: {{^}}test_export_zeroes: -; GCN: exp mrt0 off, off, off, off{{$}} -; GCN: exp mrt0 off, off, off, off done{{$}} -define void @test_export_zeroes() #0 { - - call void @llvm.SI.export(i32 0, i32 0, i32 0, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0) - call void @llvm.SI.export(i32 0, i32 0, i32 1, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0) - ret void -} - -; FIXME: Should not set up registers for the unused source registers. 
- -; GCN-LABEL: {{^}}test_export_en_src0: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} -define void @test_export_en_src0() #0 { - call void @llvm.SI.export(i32 1, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_en_src1: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} -define void @test_export_en_src1() #0 { - call void @llvm.SI.export(i32 2, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_en_src2: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} -define void @test_export_en_src2() #0 { - call void @llvm.SI.export(i32 4, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_en_src3: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}} -define void @test_export_en_src3() #0 { - call void @llvm.SI.export(i32 8, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_en_src0_src1: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} -define void @test_export_en_src0_src1() #0 { - call void @llvm.SI.export(i32 3, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_en_src0_src2: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} -define void @test_export_en_src0_src2() #0 { - call void @llvm.SI.export(i32 5, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_en_src0_src3: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} -; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} -define void @test_export_en_src0_src3() #0 { - call void @llvm.SI.export(i32 9, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 9, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_en_src0_src1_src2_src3: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: 
v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_en_src0_src1_src2_src3() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_mrt7: -; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5 -; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} -; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} -define void @test_export_mrt7() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5) - ret void -} - -; GCN-LABEL: {{^}}test_export_z: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_z() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_null: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_null() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_reserved10: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved10() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_reserved11: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved11() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: 
{{^}}test_export_pos0: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos0() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_pos3: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos3() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_param0: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param0() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_param31: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} -; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param31() #0 { - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -; GCN-LABEL: {{^}}test_export_vm: -; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 -; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 -; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} -; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} -define void @test_export_vm() #0 { - call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) - ret void -} - -attributes #0 = { nounwind "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll deleted file mode 100644 index 9e7c3c2e6201..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll +++ /dev/null @@ -1,59 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s -;RUN: llc < %s -march=amdgcn 
-mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s -;RUN: llc < %s -march=amdgcn -mcpu=stoney -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s - -;GCN-LABEL: {{^}}main: -;GCN-NOT: s_wqm -;GCN: s_mov_b32 m0 -;GCN-DAG: v_interp_mov_f32 -;GCN-DAG: v_interp_p1_f32 -;GCN-DAG: v_interp_p2_f32 - -define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) { -main_body: - %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4) - %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7) - ret void -} - -; Thest that v_interp_p1 uses different source and destination registers -; on 16 bank LDS chips. - -; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: -; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] - -define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { -main_body: - %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) - %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) - %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7) - %25 = call float @fabs(float %22) - %26 = call float @fabs(float %23) - %27 = call float @fabs(float %24) - %28 = call i32 @llvm.SI.packf16(float %25, float %26) - %29 = bitcast i32 %28 to float - %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00) - %31 = bitcast i32 %30 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31) - ret void -} - -; Function Attrs: readnone -declare float @fabs(float) #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.constant(i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind readnone } -attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll deleted file mode 100644 index aef9f660436e..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll +++ /dev/null @@ -1,525 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}gather4_v2: -;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_v2() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 
= extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4: -;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl: -;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_cl() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l: -;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_l() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b: -;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_b() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl: -;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_b_cl() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - 
-;CHECK-LABEL: {{^}}gather4_b_cl_v8: -;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_b_cl_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz_v2: -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_lz_v2() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz: -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_lz() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_o: -;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl_o: -;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_cl_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl_o_v8: -;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_cl_o_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l_o: -;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_l_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l_o_v8: -;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_l_o_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_o: -;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_b_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_o_v8: -;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_b_o_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl_o: -;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_b_cl_o() { -main_body: - %r = call <4 x float> 
@llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz_o: -;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_lz_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_c: -;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl: -;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_cl() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl_v8: -;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_cl_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l: -;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_l() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, 
i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l_v8: -;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_l_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b: -;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_b() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_v8: -;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_b_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_cl: -;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_b_cl() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz: -;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_lz() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = 
extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_c_o: -;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_o_v8: -;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_o_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl_o: -;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_cl_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l_o: -;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_l_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_o: -;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_b_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void 
@llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_cl_o: -;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_b_cl_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz_o: -;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_lz_o() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz_o_v8: -;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define amdgpu_ps void @gather4_c_lz_o_v8() { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_sgpr_bug: -; -; This crashed at some point due to a bug in FixSGPRCopies. 
Derived from the -; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877 -; -;CHECK: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 -;CHECK: s_waitcnt lgkmcnt(0) -;CHECK: s_mov_b32 s[[LO]], 0 -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8 -define amdgpu_ps float @gather4_sgpr_bug() { -main_body: - %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef, align 16 - %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 - %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp4 = extractelement <4 x float> %tmp2, i32 1 - %tmp9 = fadd float undef, %tmp4 - ret float %tmp9 -} - -declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> 
@llvm.SI.gather4.c.l.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll deleted file mode 100644 index ac34d31b97c1..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll +++ /dev/null @@ -1,44 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}getlod: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define amdgpu_ps void @getlod() { -main_body: - %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - -;CHECK-LABEL: {{^}}getlod_v2: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define amdgpu_ps void @getlod_v2() { -main_body: - %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - -;CHECK-LABEL: {{^}}getlod_v4: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define amdgpu_ps void @getlod_v4() { -main_body: - %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <8 x i32> undef, <4 
x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - - -declare <4 x float> @llvm.SI.getlod.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.ll b/test/CodeGen/AMDGPU/llvm.SI.image.ll deleted file mode 100644 index 50341e3e207f..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.image.ll +++ /dev/null @@ -1,49 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}image_load: -;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @image_load() { -main_body: - %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}image_load_mip: -;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @image_load_mip() { -main_body: - %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}getresinfo: -;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @getresinfo() { -main_body: - %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind readnone } diff --git 
a/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll deleted file mode 100644 index 7cdd9559994e..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll +++ /dev/null @@ -1,94 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s - -; CHECK-LABEL: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xd -define amdgpu_ps void @v1(i32 %a1) { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v2: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xb -define amdgpu_ps void @v2(i32 %a1) { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v3: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xe -define amdgpu_ps void @v3(i32 %a1) { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v4: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x7 -define amdgpu_ps void @v4(i32 %a1) { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v5: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xa -define amdgpu_ps void @v5(i32 %a1) { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, 
float %3) - ret void -} - -; CHECK-LABEL: {{^}}v6: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x6 -define amdgpu_ps void @v6(i32 %a1) { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v7: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x9 -define amdgpu_ps void @v7(i32 %a1) { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -declare <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll deleted file mode 100644 index 60077dc218fd..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll +++ /dev/null @@ -1,309 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}sample: -;CHECK: s_wqm -;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cl: -;CHECK: s_wqm -;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_d() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x 
i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_d_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_l() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b: -;CHECK: s_wqm -;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_b() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_b_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_lz() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) 
- %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_cd() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_cd_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c: -;CHECK: s_wqm -;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_d() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = 
extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_d_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_l() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b: -;CHECK: s_wqm -;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_b() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_b_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_lz() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> 
%r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_cd() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_cd_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, 
i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll deleted file mode 100644 index 34d4f6825690..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll +++ /dev/null @@ -1,309 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}sample: -;CHECK: s_wqm -;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cl: -;CHECK: s_wqm -;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_d() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_d_cl() { -main_body: - %r = call <4 x float> 
@llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_l() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b: -;CHECK: s_wqm -;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_b() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_b_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_lz() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_cd() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x 
i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_cd_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c: -;CHECK: s_wqm -;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_d() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_d_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> 
undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_l() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b: -;CHECK: s_wqm -;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_b() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_b_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_lz() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_cd() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, 
i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @sample_c_cd_cl() { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - -declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, 
i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll index ee0a41f2210f..51f564d96909 100644 --- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -34,8 +34,8 @@ main_body: %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) %tmp23 = bitcast i32 %tmp22 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23) + call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp21, float %tmp23, float %tmp23, float %tmp23, i1 true, i1 false) ret void } @@ -45,9 +45,10 @@ declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i3 ; Function Attrs: nounwind readonly declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 attributes #0 = { nounwind readonly } +attributes #1 = { nounwind inaccessiblememonly } !0 = !{!"const", !1, i32 1} !1 = !{!"tbaa root"} diff --git a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll deleted file mode 100644 index 6984b4cf488a..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}main: -; GCN: v_cvt_pkrtz_f16_f32 -; GCN: v_cvt_pkrtz_f16_f32 -; GCN-NOT: v_cvt_pkrtz_f16_f32 - -define amdgpu_ps void @main(float %src) { -main_body: - %p1 = call i32 @llvm.SI.packf16(float undef, float %src) - %p2 = call i32 @llvm.SI.packf16(float %src, float undef) - %p3 = call i32 @llvm.SI.packf16(float undef, float undef) - %f1 = bitcast i32 %p1 to float - %f2 = bitcast i32 %p2 to float - %f3 = bitcast i32 %p3 to float - call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f1, float undef, float %f1) - call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f2, float undef, float %f2) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %f3, float undef, float %f2) - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 9c845e84bc12..56966a19cf7b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -1,21 +1,45 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | 
FileCheck -check-prefix=GCN -check-prefix=CI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2 -declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2 -declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2 -declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2 -declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2 +declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2 declare i32 @llvm.amdgcn.workitem.id.x() #1 +; Make sure no crash on invalid non-constant +; GCN-LABEL: {{^}}invalid_variable_order_lds_atomic_dec_ret_i32: +define amdgpu_kernel void @invalid_variable_order_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %order.var) #0 { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 %order.var, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; Make sure no crash on invalid non-constant +; GCN-LABEL: {{^}}invalid_variable_scope_lds_atomic_dec_ret_i32: +define amdgpu_kernel void @invalid_variable_scope_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %scope.var) #0 { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 %scope.var, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; Make sure no crash on invalid non-constant +; GCN-LABEL: {{^}}invalid_variable_volatile_lds_atomic_dec_ret_i32: +define amdgpu_kernel void @invalid_variable_volatile_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i1 %volatile.var) #0 { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 %volatile.var) + store i32 %result, i32 addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] -define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) +define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out ret void } @@ -23,9 +47,9 @@ define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* 
%ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out ret void } @@ -35,25 +59,25 @@ define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_dec_u32 [[VPTR]], [[DATA]] -define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) +define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) +define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out ret void } @@ -61,26 +85,26 @@ define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32: ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) +define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } ; FUNC-LABEL: 
{{^}}global_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -88,12 +112,12 @@ define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}} ; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out.gep ret void } @@ -102,19 +126,19 @@ define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} ; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) +define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(4)* %out ret void } @@ -122,38 +146,38 @@ define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* % ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 
addrspace(4)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(4)* %out ret void } ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32: ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind { - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) +define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind { + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(4)* %out.gep ret void } @@ -161,11 +185,11 @@ define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 a ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -173,8 +197,8 @@ define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 
addrspace(4)* %ptr, i64 42) +define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(4)* %out ret void } @@ -183,9 +207,9 @@ define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* % ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(4)* %out ret void } @@ -194,8 +218,8 @@ define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspac ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) +define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -203,35 +227,35 @@ define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(4)* %out.gep ret void } ; GCN-LABEL: 
{{^}}flat_atomic_dec_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -240,11 +264,11 @@ define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 ; SI-LABEL: {{^}}atomic_dec_shl_base_lds_0: ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]] offset:8 -define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9) + %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false) store i32 %idx.0, i32 addrspace(1)* %add_use store i32 %val0, i32 addrspace(1)* %out ret void @@ -254,8 +278,8 @@ define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) +define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void } @@ -264,9 +288,9 @@ define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 -define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void } @@ -275,8 +299,8 @@ define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: 
ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) +define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -284,9 +308,9 @@ define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} -define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -294,8 +318,8 @@ define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) +define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void } @@ -304,9 +328,9 @@ define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void } @@ -315,8 +339,8 @@ define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrsp ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) +define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -324,37 +348,37 @@ define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 
0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out.gep ret void } ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -363,11 +387,11 @@ define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) # ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 -define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 - %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, 
i64 9)
+ %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use
store i64 %val0, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index 22097418eec4..3d64f93db2e4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -1,21 +1,21 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2
declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
@@ -23,9 +23,9 @@ define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
@@ -35,25 +35,25 @@ define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
-define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = call i32
@llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) +define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) +define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out ret void } @@ -61,26 +61,26 @@ define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32: ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) +define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -88,12 +88,12 @@ define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_inc [[K]], 
v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}} ; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out.gep ret void } @@ -102,11 +102,11 @@ define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} ; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -115,11 +115,11 @@ define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) # ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9) + %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false) store i32 %idx.0, i32 addrspace(1)* %add_use store i32 %val0, i32 addrspace(1)* %out ret void @@ -129,8 +129,8 @@ define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) +define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void } @@ -139,9 +139,9 @@ define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-DAG: v_mov_b32_e32 
v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 -define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void } @@ -150,8 +150,8 @@ define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) +define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -159,9 +159,9 @@ define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} -define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -169,8 +169,8 @@ define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) +define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void } @@ -179,9 +179,9 @@ define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 
addrspace(1)* %out ret void } @@ -190,8 +190,8 @@ define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrsp ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) +define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -199,45 +199,45 @@ define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out.gep ret void } ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42) + %result = call i64 
@llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) +define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(4)* %out ret void } @@ -245,38 +245,38 @@ define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* % ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(4)* %out ret void } ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32: ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind { - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) +define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind { + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(4)* %out.gep ret void } @@ -284,11 +284,11 @@ define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 a ; GCN-LABEL: 
{{^}}flat_atomic_inc_noret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42) + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -297,31 +297,22 @@ define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 -define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 - %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9) + %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false) store i32 %idx.0, i32 addrspace(1)* %add_use store i64 %val0, i64 addrspace(1)* %out ret void } -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind argmemonly } - - - - - - ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) +define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(4)* %out ret void } @@ -330,9 +321,9 @@ define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* % ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(4)* %out ret void } @@ -341,8 +332,8 @@ define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspac ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: 
flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) +define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -350,34 +341,38 @@ define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(4)* %out.gep ret void } ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42) + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false) ret void } + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind argmemonly } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll index 6d9db65e7d93..10bea8ea63b0 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll @@ -8,7 +8,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1() #0 ; SI-NEXT: 
buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00]
; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00]
; GCN-NEXT: s_endpgm
-define void @test_buffer_wbinvl1() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1() #0 {
call void @llvm.amdgcn.buffer.wbinvl1()
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
index 746298465e58..fe60d16d90f7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
@@ -6,7 +6,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0
; SI-NEXT: ; BB#0:
; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
; SI-NEXT: s_endpgm
-define void @test_buffer_wbinvl1_sc() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 {
call void @llvm.amdgcn.buffer.wbinvl1.sc()
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
index 4e0f3c37f214..061c1469ed4d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
@@ -8,7 +8,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0
; CI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
; VI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
; GCN: s_endpgm
-define void @test_buffer_wbinvl1_vol() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_vol() #0 {
call void @llvm.amdgcn.buffer.wbinvl1.vol()
; This used to crash in hazard recognizer
store i8 0, i8 addrspace(1)* undef, align 1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index 011a0fdbd219..f08d4b6c7915 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -10,7 +10,7 @@ declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @class_f16(
+define amdgpu_kernel void @class_f16(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
i32 addrspace(1)* %b) {
@@ -31,7 +31,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fabs(
+define amdgpu_kernel void @class_f16_fabs(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -46,12 +46,12 @@ entry:
; GCN-LABEL: {{^}}class_f16_fneg
; GCN: s_load_dword s[[SA_F16:[0-9]+]]
; GCN: s_load_dword s[[SB_I32:[0-9]+]]
-; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
-; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -v[[VA_F16]], s[[SB_I32]]
+; VI: v_trunc_f16_e64 v[[VA_F16:[0-9]+]], -s[[SA_F16]]
+; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[VA_F16]], s[[SB_I32]]
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fneg(
+define amdgpu_kernel void @class_f16_fneg(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -71,7 +71,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fabs_fneg(
+define amdgpu_kernel void @class_f16_fabs_fneg(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -91,7 +91,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
;
GCN: s_endpgm -define void @class_f16_1( +define amdgpu_kernel void @class_f16_1( i32 addrspace(1)* %r, half %a.val) { entry: @@ -108,7 +108,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_64( +define amdgpu_kernel void @class_f16_64( i32 addrspace(1)* %r, half %a.val) { entry: @@ -126,7 +126,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_full_mask( +define amdgpu_kernel void @class_f16_full_mask( i32 addrspace(1)* %r, half %a.val) { entry: @@ -144,7 +144,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_nine_bit_mask( +define amdgpu_kernel void @class_f16_nine_bit_mask( i32 addrspace(1)* %r, half %a.val) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index 668c669e41e8..1fcdac537fba 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -14,7 +14,7 @@ declare double @llvm.fabs.f64(double) #1 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -29,7 +29,7 @@ define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -45,7 +45,7 @@ define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fneg = fsub float -0.0, %a %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -61,7 +61,7 @@ define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %a.fneg.fabs = fsub float -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1 @@ -76,7 +76,7 @@ define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { +define 
amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -104,7 +104,7 @@ define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -118,7 +118,7 @@ define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -132,7 +132,7 @@ define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -150,7 +150,7 @@ define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace( ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -170,7 +170,7 @@ define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* % ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = 
getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -190,7 +190,7 @@ define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i3 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -205,7 +205,7 @@ define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -221,7 +221,7 @@ define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fneg = fsub double -0.0, %a %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -237,7 +237,7 @@ define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %a.fneg.fabs = fsub double -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1 @@ -249,7 +249,7 @@ define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) ; SI-LABEL: {{^}}test_class_1_f64: ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} ; SI: s_endpgm -define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -259,7 +259,7 @@ define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI-LABEL: {{^}}test_class_64_f64: ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} ; SI: s_endpgm -define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -275,7 +275,7 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { +define 
amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -290,7 +290,7 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -306,7 +306,7 @@ define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace ; XSI: v_cmp_class_f64_e32 vcc, 1.0, ; SI: v_cmp_class_f64_e32 vcc, ; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -321,7 +321,7 @@ define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* % ; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64: ; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -338,7 +338,7 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3 ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -358,7 +358,7 @@ define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1) ; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -381,7 +381,7 @@ define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1 ; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -416,7 +416,7 @@ define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float ad ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -436,7 +436,7 @@ define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1) ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -456,7 +456,7 @@ define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1) ; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} ; SI: s_or_b64 ; SI: s_endpgm -define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { +define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -476,7 +476,7 @@ define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -488,7 +488,7 @@ define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -500,7 +500,7 @@ define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI-NOT: v_cmp_class ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, ; SI: buffer_store_dword -define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 diff --git 
a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll index 410ac59279a5..054388607293 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.cos.f16(half %a) ; VI: v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @cos_f16( +define amdgpu_kernel void @cos_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll index f6495d8155f7..5b9c83c11cf4 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cos.f32(float) #0 ; GCN-LABEL: {{^}}v_cos_f32: ; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_cos_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_cos_f32(float addrspace(1)* %out, float %src) #1 { %cos = call float @llvm.amdgcn.cos.f32(float %src) #0 store float %cos, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll index 22bed45ee30f..dadb070bdcf8 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubeid(float, float, float) #0 ; GCN-LABEL: {{^}}test_cubeid: ; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 { %result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) store float %result, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll index 565f22c5d5b6..60c4618a011b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubema(float, float, float) #0 ; GCN-LABEL: {{^}}test_cubema: ; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 { %result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c) store float %result, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll index a3ba32745814..10669cf99138 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubesc(float, float, float) #0 ; GCN-LABEL: {{^}}test_cubesc: ; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 { %result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c) store float %result, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll index d3c0f2851ead..b2770308c170 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll @@ -5,7 +5,7 @@ declare 
float @llvm.amdgcn.cubetc(float, float, float) #0 ; GCN-LABEL: {{^}}test_cubetc: ; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 { %result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c) store float %result, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll new file mode 100644 index 000000000000..b92eb34750d9 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -0,0 +1,166 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s + +; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { + %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) + store <2 x half> %result, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { + %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) + store <2 x half> %result, <2 x half> addrspace(1)* %out + ret void +} + +; FIXME: Folds to 0 on gfx9 +; GCN-LABEL: {{^}}s_cvt_pkrtz_undef_undef: +; GCN-NEXT: ; BB#0 +; SI-NEXT: s_endpgm +; VI-NEXT: s_endpgm +; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 { + %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) + store <2 x half> %result, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] +; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm: +; GCN: {{buffer|flat}}_load_dword 
[[A:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0 +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] +; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]] +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]] +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.b = fsub float -0.0, %b + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; 
GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]] +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fabs.a = call float @llvm.fabs.f32(float %a) + %neg.fabs.a = fsub float -0.0, %fabs.a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b) + store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep + ret void +} + +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare float @llvm.fabs.f32(float) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll index 6c09aa592447..58250de2f891 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -9,7 +9,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @dispatch_id(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 { %tmp0 = call i64 @llvm.amdgcn.dispatch.id() store i64 %tmp0, i64 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll index 2e8625256f13..92208e7fe17c 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_dispatch_ptr = 1 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* %value = load i32, i32 
addrspace(2)* %header_ptr diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll index 6d262cf497ac..e04d9e662cea 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -9,7 +9,7 @@ declare half @llvm.amdgcn.div.fixup.f16(half %a, half %b, half %c) ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16( +define amdgpu_kernel void @div_fixup_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -30,7 +30,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a( +define amdgpu_kernel void @div_fixup_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -49,7 +49,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_b( +define amdgpu_kernel void @div_fixup_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -68,7 +68,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -86,7 +86,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a_imm_b( +define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b( half addrspace(1)* %r, half addrspace(1)* %c) { entry: @@ -102,7 +102,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_b_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -118,7 +118,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c( half addrspace(1)* %r, half addrspace(1)* %b) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll index cc1504f2bc8d..b8fcacf46bba 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll @@ -16,7 +16,7 @@ declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readn ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -24,7 +24,7 @@ define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, fl ; GCN-LABEL: {{^}}test_div_fixup_f64: ; GCN: v_div_fixup_f64 -define void @test_div_fixup_f64(double addrspace(1)* %out, 
double %a, double %b, double %c) nounwind { +define amdgpu_kernel void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { %result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index d408fe9f87f6..a86468b07a27 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -20,7 +20,7 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -34,7 +34,7 @@ define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, flo ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -48,7 +48,7 @@ define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -62,7 +62,7 @@ define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -70,7 +70,7 @@ define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, ; GCN-LABEL: {{^}}test_div_fmas_f64: ; GCN: v_div_fmas_f64 -define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) 
nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void @@ -79,7 +79,7 @@ define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: ; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone store float %result, float addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, f ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: ; SI: s_mov_b64 vcc, 0 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -98,7 +98,7 @@ define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, f ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: ; SI: s_mov_b64 vcc, -1 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -114,7 +114,7 @@ define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, fl ; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] ; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] ; SI: s_endpgm -define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 @@ -150,7 +150,7 @@ define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, flo ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: buffer_store_dword ; SI: s_endpgm -define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll index 
8e5c62c31db5..0b4f09ac6517 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -11,7 +11,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -31,7 +31,7 @@ define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -51,7 +51,7 @@ define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -71,7 +71,7 @@ define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1) ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -91,7 +91,7 @@ define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1) ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -109,7 +109,7 @@ define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, 
float add ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -127,7 +127,7 @@ define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float add ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -145,7 +145,7 @@ define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float add ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -163,7 +163,7 @@ define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float add ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -181,7 +181,7 @@ define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double a ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -199,7 +199,7 @@ define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double a ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double 
addrspace(1)* %in, double %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -217,7 +217,7 @@ define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double a ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -236,7 +236,7 @@ define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double a ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -250,7 +250,7 @@ define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -265,7 +265,7 @@ define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -280,7 +280,7 @@ define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double % ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone %result0 = extractvalue { double, i1 } 
%result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -292,7 +292,7 @@ define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double % ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %a = load float, float addrspace(1)* %gep.0, align 4 @@ -308,7 +308,7 @@ define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float a ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %a = load float, float addrspace(1)* %gep.0, align 4 @@ -326,7 +326,7 @@ define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float a ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -349,7 +349,7 @@ define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspa ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll index 92d3fc8b107e..08f286a7f510 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll @@ -4,8 +4,7 @@ declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0 ; FUNC-LABEL: {{^}}ds_bpermute: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void @@ -13,8 +12,7 @@ define void @ds_bpermute(i32 addrspace(1)* %out, i32 
%index, i32 %src) nounwind ; CHECK-LABEL: {{^}}ds_bpermute_imm_offset: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4 -; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { %index = add i32 %base_index, 4 %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 @@ -23,8 +21,7 @@ define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 ; CHECK-LABEL: {{^}}ds_bpermute_imm_index: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64 -; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll index 6d9c94191535..63618c3aed77 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll @@ -4,8 +4,7 @@ declare i32 @llvm.amdgcn.ds.permute(i32, i32) #0 ; CHECK-LABEL: {{^}}ds_permute: ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CHECK: s_waitcnt lgkmcnt -define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { +define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void @@ -13,8 +12,7 @@ define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { ; CHECK-LABEL: {{^}}ds_permute_imm_offset: ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4 -; CHECK: s_waitcnt lgkmcnt -define void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { %index = add i32 %base_index, 4 %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll index ef3cb00024bb..a3a78d326a62 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0 ; FUNC-LABEL: {{^}}ds_swizzle: ; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100 ; CHECK: s_waitcnt lgkmcnt -define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { +define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 store i32 %swizzle, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll new file mode 100644 index 000000000000..b972ddb8cb77 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll @@ -0,0 +1,162 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s +; RUN: llc 
-march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2i16(i32, i32, <2 x i16>, <2 x i16>, i1, i1) #0
+
+; GCN-LABEL: {{^}}test_export_compr_zeroes_v2f16:
+; GCN: exp mrt0 off, off, off, off compr{{$}}
+; GCN: exp mrt0 off, off, off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_zeroes_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src0_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src1_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src1_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src0_src1_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_src1_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_invalid2_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 off, [[SRC0]], off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_invalid2_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_invalid10_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 off, [[SRC0]], off, [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_invalid10_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 10, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_mrt7_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0x38003800
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] compr{{$}}
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_mrt7_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_z_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_z_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_vm_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}}
+define amdgpu_kernel void @test_export_compr_vm_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 true)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 true)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_zeroes_v2i16:
+; GCN: exp mrt0 off, off, off, off compr{{$}}
+; GCN: exp mrt0 off, off, off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_zeroes_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src0_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrt0 [[SRC0]], off, off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 1, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src1_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src1_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 12, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src0_src1_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_src1_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_mrt7_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[VI16:v[0-9]+]], 0x50005
+; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] compr{{$}}
+; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_mrt7_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_z_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_z_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_vm_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}}
+define amdgpu_kernel void @test_export_compr_vm_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 true)
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 true)
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
new file mode 100644
index 000000000000..6d2de108829d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -0,0 +1,484 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
+
+; GCN-LABEL: {{^}}test_export_zeroes_f32:
+; GCN: exp mrt0 off, off, off, off{{$}}
+; GCN: exp mrt0 off, off, off, off done{{$}}
+define amdgpu_kernel void @test_export_zeroes_f32() #0 {
+
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false)
+ ret void
+}
+
+; FIXME: Should not set up registers for the unused source registers.
+ +; GCN-LABEL: {{^}}test_export_en_src0_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} +define amdgpu_kernel void @test_export_en_src0_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src1_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} +define amdgpu_kernel void @test_export_en_src1_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src2_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} +define amdgpu_kernel void @test_export_en_src2_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src3_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_en_src3_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src0_src1_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} +define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src0_src2_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} +define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src0_src3_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} +; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: 
{{^}}test_export_en_src0_src1_src2_src3_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_mrt7_f32: +; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5 +; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} +; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} +define amdgpu_kernel void @test_export_mrt7_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_z_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_z_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_null_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_null_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_reserved10_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_reserved10_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_reserved11_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp invalid_target_11 [[SRC0]], 
[[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_reserved11_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_pos0_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_pos0_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_pos3_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_pos3_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_param0_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_param0_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_param31_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_param31_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_vm_f32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 +; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} +; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} +define amdgpu_kernel void @test_export_vm_f32() #0 { + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 true) + call void 
@llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 true) + ret void +} + + + + + + + + + + + + + + + +; GCN-LABEL: {{^}}test_export_zeroes_i32: +; GCN: exp mrt0 off, off, off, off{{$}} +; GCN: exp mrt0 off, off, off, off done{{$}} +define amdgpu_kernel void @test_export_zeroes_i32() #0 { + + call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 true, i1 false) + ret void +} + +; FIXME: Should not set up registers for the unused source registers. + +; GCN-LABEL: {{^}}test_export_en_src0_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} +define amdgpu_kernel void @test_export_en_src0_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 1, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src1_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} +define amdgpu_kernel void @test_export_en_src1_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 2, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src2_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} +define amdgpu_kernel void @test_export_en_src2_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 4, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src3_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_en_src3_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 8, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src0_src1_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} +define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src0_src2_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} +define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 5, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src0_src3_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 
[[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} +; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_en_src0_src1_src2_src3_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_mrt7_i32: +; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 5 +; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} +; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} +define amdgpu_kernel void @test_export_mrt7_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_z_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_z_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_null_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_null_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_reserved10_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_reserved10_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_reserved11_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; 
GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_reserved11_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_pos0_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_pos0_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_pos3_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_pos3_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_param0_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_param0_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_param31_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} +; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} +define amdgpu_kernel void @test_export_param31_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) + call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) + ret void +} + +; GCN-LABEL: {{^}}test_export_vm_i32: +; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1 +; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 +; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 +; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} +; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} +define amdgpu_kernel void @test_export_vm_i32() #0 { + call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 true) + call void @llvm.amdgcn.exp.i32(i32 0, i32 15, 
i32 1, i32 2, i32 5, i32 4, i1 true, i1 true) + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind inaccessiblememonly } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll index 427ad5ef553d..c9993ee88369 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll @@ -5,9 +5,17 @@ declare i64 @llvm.amdgcn.fcmp.f32(float, float, i32) #0 declare i64 @llvm.amdgcn.fcmp.f64(double, double, i32) #0 declare float @llvm.fabs.f32(float) #0 +; GCN-LABEL: {{^}}v_fcmp_f32_dynamic_cc: +; GCN: s_endpgm +define amdgpu_kernel void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) { + %result = call i64 @llvm.amdgcn.fcmp.f32(float %src0, float %src1, i32 %cc) + store i64 %result, i64 addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs: ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}} -define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { +define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) store i64 %result, i64 addrspace(1)* %out @@ -16,7 +24,7 @@ define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs: ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}| -define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { +define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) %result = call i64 @llvm.amdgcn.fcmp.f32(float %src_input, float %temp, i32 1) @@ -26,7 +34,7 @@ define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, floa ; GCN-LABEL: {{^}}v_fcmp: ; GCN-NOT: v_cmp_eq_f32_e64 -define void @v_fcmp(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1) store i64 %result, i64 addrspace(1)* %out ret void @@ -34,7 +42,7 @@ define void @v_fcmp(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_oeq: ; GCN: v_cmp_eq_f32_e64 -define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1) store i64 %result, i64 addrspace(1)* %out ret void @@ -42,7 +50,7 @@ define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_one: ; GCN: v_cmp_neq_f32_e64 -define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6) store i64 %result, i64 addrspace(1)* %out ret void @@ -50,7 +58,7 @@ define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ogt: ; GCN: v_cmp_gt_f32_e64 -define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2) store i64 %result, 
i64 addrspace(1)* %out ret void @@ -58,7 +66,7 @@ define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_oge: ; GCN: v_cmp_ge_f32_e64 -define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3) store i64 %result, i64 addrspace(1)* %out ret void @@ -66,7 +74,7 @@ define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_olt: ; GCN: v_cmp_lt_f32_e64 -define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4) store i64 %result, i64 addrspace(1)* %out ret void @@ -74,7 +82,7 @@ define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ole: ; GCN: v_cmp_le_f32_e64 -define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5) store i64 %result, i64 addrspace(1)* %out ret void @@ -83,7 +91,7 @@ define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ueq: ; GCN: v_cmp_nlg_f32_e64 -define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9) store i64 %result, i64 addrspace(1)* %out ret void @@ -91,7 +99,7 @@ define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_une: ; GCN: v_cmp_neq_f32_e64 -define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14) store i64 %result, i64 addrspace(1)* %out ret void @@ -99,7 +107,7 @@ define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ugt: ; GCN: v_cmp_nle_f32_e64 -define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10) store i64 %result, i64 addrspace(1)* %out ret void @@ -107,7 +115,7 @@ define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_uge: ; GCN: v_cmp_nlt_f32_e64 -define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11) store i64 %result, i64 addrspace(1)* %out ret void @@ -115,7 +123,7 @@ define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ult: ; GCN: v_cmp_nge_f32_e64 -define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12) store i64 %result, i64 addrspace(1)* %out ret void @@ -123,7 +131,7 @@ define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ule: ; GCN: v_cmp_ngt_f32_e64 -define void @v_fcmp_f32_ule(i64 
addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13) store i64 %result, i64 addrspace(1)* %out ret void @@ -131,7 +139,7 @@ define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_oeq: ; GCN: v_cmp_eq_f64_e64 -define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1) store i64 %result, i64 addrspace(1)* %out ret void @@ -139,7 +147,7 @@ define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_one: ; GCN: v_cmp_neq_f64_e64 -define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6) store i64 %result, i64 addrspace(1)* %out ret void @@ -147,7 +155,7 @@ define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ogt: ; GCN: v_cmp_gt_f64_e64 -define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2) store i64 %result, i64 addrspace(1)* %out ret void @@ -155,7 +163,7 @@ define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_oge: ; GCN: v_cmp_ge_f64_e64 -define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3) store i64 %result, i64 addrspace(1)* %out ret void @@ -163,7 +171,7 @@ define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_olt: ; GCN: v_cmp_lt_f64_e64 -define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4) store i64 %result, i64 addrspace(1)* %out ret void @@ -171,7 +179,7 @@ define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ole: ; GCN: v_cmp_le_f64_e64 -define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5) store i64 %result, i64 addrspace(1)* %out ret void @@ -179,7 +187,7 @@ define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ueq: ; GCN: v_cmp_nlg_f64_e64 -define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9) store i64 %result, i64 addrspace(1)* %out ret void @@ -187,7 +195,7 @@ define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_une: ; GCN: v_cmp_neq_f64_e64 -define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 
100.00, i32 14) store i64 %result, i64 addrspace(1)* %out ret void @@ -195,7 +203,7 @@ define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ugt: ; GCN: v_cmp_nle_f64_e64 -define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10) store i64 %result, i64 addrspace(1)* %out ret void @@ -203,7 +211,7 @@ define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_uge: ; GCN: v_cmp_nlt_f64_e64 -define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11) store i64 %result, i64 addrspace(1)* %out ret void @@ -211,7 +219,7 @@ define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ult: ; GCN: v_cmp_nge_f64_e64 -define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12) store i64 %result, i64 addrspace(1)* %out ret void @@ -219,7 +227,7 @@ define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ule: ; GCN: v_cmp_ngt_f64_e64 -define void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13) store i64 %result, i64 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll index 54d7848da3bf..248ee9904da0 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll @@ -8,7 +8,7 @@ declare float @llvm.amdgcn.fdiv.fast(float, float) #0 ; CHECK: v_rcp_f32_e32 ; CHECK: v_mul_f32_e32 ; CHECK: v_mul_f32_e32 -define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { +define amdgpu_kernel void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b) store float %fdiv, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll new file mode 100644 index 000000000000..a4ae37b23c5f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_fmed3_f16: +; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { + %src0.f16 = trunc i32 %src0.arg to i16 + %src0 = bitcast i16 %src0.f16 to half + %src1.f16 = trunc i32 %src1.arg to i16 + %src1 = bitcast i16 %src1.f16 to half + %src2.f16 = trunc i32 %src2.arg to i16 + %src2 = bitcast i16 %src2.f16 to half + %mad = call half @llvm.amdgcn.fmed3.f16(half %src0, half %src1, half %src2) + store half %mad, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fmed3_srcmods_f16: +; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}| +define amdgpu_kernel void 
@test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { + %src0.f16 = trunc i32 %src0.arg to i16 + %src0 = bitcast i16 %src0.f16 to half + %src1.f16 = trunc i32 %src1.arg to i16 + %src1 = bitcast i16 %src1.f16 to half + %src2.f16 = trunc i32 %src2.arg to i16 + %src2 = bitcast i16 %src2.f16 to half + %src0.fneg = fsub half -0.0, %src0 + %src1.fabs = call half @llvm.fabs.f16(half %src1) + %src2.fabs = call half @llvm.fabs.f16(half %src2) + %src2.fneg.fabs = fsub half -0.0, %src2.fabs + %mad = call half @llvm.amdgcn.fmed3.f16(half %src0.fneg, half %src1.fabs, half %src2.fneg.fabs) + store half %mad, half addrspace(1)* %out + ret void +} + +declare half @llvm.amdgcn.fmed3.f16(half, half, half) #0 +declare half @llvm.fabs.f16(half) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll new file mode 100644 index 000000000000..230e625ad45b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_fmed3: +; GCN: v_med3_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { + %mad = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2) + store float %mad, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fmed3_srcmods: +; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}| +define amdgpu_kernel void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { + %src0.fneg = fsub float -0.0, %src0 + %src1.fabs = call float @llvm.fabs.f32(float %src1) + %src2.fabs = call float @llvm.fabs.f32(float %src2) + %src2.fneg.fabs = fsub float -0.0, %src2.fabs + %mad = call float @llvm.amdgcn.fmed3.f32(float %src0.fneg, float %src1.fabs, float %src2.fneg.fabs) + store float %mad, float addrspace(1)* %out + ret void +} + +declare float @llvm.amdgcn.fmed3.f32(float, float, float) #0 +declare float @llvm.fabs.f32(float) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll index d5c1c0a0969b..b47d2dbc744d 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_f32: ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) store float %result, float addrspace(1)* %out, align 4 ret void @@ -12,7 +12,7 @@ define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) # ; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32: ; GCN: v_mul_legacy_f32_e32 -define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a) store float %result, float addrspace(1)* %out, align 4 ret 
void @@ -20,7 +20,7 @@ define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { ; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32: ; GCN: v_mul_legacy_f32_e32 -define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef) store float %result, float addrspace(1)* %out, align 4 ret void @@ -28,7 +28,7 @@ define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32: ; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}| -define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) %b.fabs = call float @llvm.fabs.f32(float %b) %result = call float @llvm.amdgcn.fmul.legacy(float %a.fabs, float %b.fabs) @@ -40,7 +40,7 @@ define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float ; GCN-LABEL: {{^}}test_mad_legacy_f32: ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_add_f32_e32 -define void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 { +define amdgpu_kernel void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 { %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) %add = fadd float %mul, %c store float %add, float addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll index d8c1af036a34..026f6901fc7f 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.fract.f16(half %a) ; VI: v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fract_f16( +define amdgpu_kernel void @fract_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll index a75267b8d693..d4f1c5fd9be7 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll @@ -6,7 +6,7 @@ declare double @llvm.amdgcn.fract.f64(double) #0 ; GCN-LABEL: {{^}}v_fract_f32: ; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_fract_f32(float addrspace(1)* %out, float %src) #1 { %fract = call float @llvm.amdgcn.fract.f32(float %src) store float %fract, float addrspace(1)* %out ret void @@ -14,7 +14,7 @@ define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 { ; GCN-LABEL: {{^}}v_fract_f64: ; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { %fract = call double @llvm.amdgcn.fract.f64(double %src) store double %fract, double addrspace(1)* %out ret void @@ -22,9 +22,8 @@ define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { ; GCN-LABEL: {{^}}v_fract_undef_f32: ; GCN-NOT: v_fract_f32 -; GCN-NOT: v0 -; GCN: buffer_store_dword v0 -define void @v_fract_undef_f32(float addrspace(1)* %out) #1 { +; GCN-NOT: store_dword +define 
amdgpu_kernel void @v_fract_undef_f32(float addrspace(1)* %out) #1 { %fract = call float @llvm.amdgcn.fract.f32(float undef) store float %fract, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll index 7521224058f3..dc3eb4ce191e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll @@ -6,7 +6,7 @@ declare i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a) ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_I16]] -define void @frexp_exp_f16( +define amdgpu_kernel void @frexp_exp_f16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -21,7 +21,7 @@ entry: ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; VI: v_bfe_i32 v[[R_I32:[0-9]+]], v[[R_I16]], 0, 16{{$}} ; GCN: buffer_store_dword v[[R_I32]] -define void @frexp_exp_f16_sext( +define amdgpu_kernel void @frexp_exp_f16_sext( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; VI: v_and_b32_e32 v[[R_I32:[0-9]+]], 0xffff, v[[R_I16]] ; GCN: buffer_store_dword v[[R_I32]] -define void @frexp_exp_f16_zext( +define amdgpu_kernel void @frexp_exp_f16_zext( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll index 9c49f175f2b5..0d686147caf8 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.frexp.exp.i32.f64(double) #0 ; GCN-LABEL: {{^}}s_test_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %src) store i32 %frexp.exp, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}| -define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fabs.src) store i32 %frexp.exp, i32 addrspace(1)* %out @@ -25,7 +25,7 @@ define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}| -define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %fneg.fabs.src = fsub float -0.0, %fabs.src %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fneg.fabs.src) @@ -35,7 +35,7 @@ define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) ; GCN-LABEL: {{^}}s_test_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_frexp_exp_f64(i32 addrspace(1)* 
%out, double %src) #1 { %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %src) store i32 %frexp.exp, i32 addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fabs.src) store i32 %frexp.exp, i32 addrspace(1)* %out @@ -52,7 +52,7 @@ define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %fneg.fabs.src = fsub double -0.0, %fabs.src %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fneg.fabs.src) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll index 706537d7e21c..722cd44e99fb 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.frexp.mant.f16(half %a) ; VI: v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @frexp_mant_f16( +define amdgpu_kernel void @frexp_mant_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll index b8d63defffed..605dc3db2b98 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll @@ -8,7 +8,7 @@ declare double @llvm.amdgcn.frexp.mant.f64(double) #0 ; GCN-LABEL: {{^}}s_test_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src) store float %frexp.mant, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}| -define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src) store float %frexp.mant, float addrspace(1)* %out @@ -25,7 +25,7 @@ define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}| -define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float 
%src) %fneg.fabs.src = fsub float -0.0, %fabs.src %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src) @@ -35,7 +35,7 @@ define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %sr ; GCN-LABEL: {{^}}s_test_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src) store double %frexp.mant, double addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src) store double %frexp.mant, double addrspace(1)* %out @@ -52,7 +52,7 @@ define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %fneg.fabs.src = fsub double -0.0, %fabs.src %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll index 6014e2ed85f8..d26fab4cebe1 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: {{^}}groupstaticsize_test0: ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} -define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { +define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 64 %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 @@ -23,7 +23,7 @@ define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* % ; CHECK-LABEL: {{^}}groupstaticsize_test1: ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} -define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { +define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { entry: %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 @@ -51,7 +51,7 @@ endif: ; preds = %else, %if ; Exceeds 16-bit simm limit of s_movk_i32 ; CHECK-LABEL: {{^}}large_groupstaticsize: ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} -define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { +define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx store 
volatile i32 0, i32 addrspace(3)* %gep %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll index 6d0457bc6489..aa04af7a64a9 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -4,9 +4,18 @@ declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0 declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #0 +; No crash on invalid input +; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc: +; GCN: s_endpgm +define amdgpu_kernel void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) { + %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 %cc) + store i64 %result, i64 addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}v_icmp_i32_eq: ; GCN: v_cmp_eq_u32_e64 -define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32) store i64 %result, i64 addrspace(1)* %out ret void @@ -14,14 +23,14 @@ define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp: ; GCN-NOT: v_cmp_eq_u32_e64 -define void @v_icmp(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i32_ne: ; GCN: v_cmp_ne_u32_e64 -define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33) store i64 %result, i64 addrspace(1)* %out ret void @@ -29,7 +38,7 @@ define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_u32_ugt: ; GCN: v_cmp_gt_u32_e64 -define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34) store i64 %result, i64 addrspace(1)* %out ret void @@ -37,7 +46,7 @@ define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_u32_uge: ; GCN: v_cmp_ge_u32_e64 -define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35) store i64 %result, i64 addrspace(1)* %out ret void @@ -45,7 +54,7 @@ define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_u32_ult: ; GCN: v_cmp_lt_u32_e64 -define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36) store i64 %result, i64 addrspace(1)* %out ret void @@ -53,7 +62,7 @@ define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_u32_ule: ; GCN: v_cmp_le_u32_e64 -define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37) store i64 %result, i64 addrspace(1)* %out ret void @@ -61,7 +70,7 @@ define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_i32_sgt: ; GCN: v_cmp_gt_i32_e64 -define void 
@v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 { +define amdgpu_kernel void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38) store i64 %result, i64 addrspace(1)* %out ret void @@ -69,7 +78,7 @@ define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 { ; GCN-LABEL: {{^}}v_icmp_i32_sge: ; GCN: v_cmp_ge_i32_e64 -define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39) store i64 %result, i64 addrspace(1)* %out ret void @@ -77,14 +86,14 @@ define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_i32_slt: ; GCN: v_cmp_lt_i32_e64 -define void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i32_sle: ; GCN: v_cmp_le_i32_e64 -define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41) store i64 %result, i64 addrspace(1)* %out ret void @@ -92,7 +101,7 @@ define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_eq: ; GCN: v_cmp_eq_u64_e64 -define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32) store i64 %result, i64 addrspace(1)* %out ret void @@ -100,7 +109,7 @@ define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_ne: ; GCN: v_cmp_ne_u64_e64 -define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33) store i64 %result, i64 addrspace(1)* %out ret void @@ -108,7 +117,7 @@ define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_u64_ugt: ; GCN: v_cmp_gt_u64_e64 -define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34) store i64 %result, i64 addrspace(1)* %out ret void @@ -116,7 +125,7 @@ define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_u64_uge: ; GCN: v_cmp_ge_u64_e64 -define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35) store i64 %result, i64 addrspace(1)* %out ret void @@ -124,7 +133,7 @@ define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_u64_ult: ; GCN: v_cmp_lt_u64_e64 -define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36) store i64 %result, i64 addrspace(1)* %out ret void @@ -132,7 +141,7 @@ define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_u64_ule: ; GCN: v_cmp_le_u64_e64 
-define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37) store i64 %result, i64 addrspace(1)* %out ret void @@ -140,7 +149,7 @@ define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_sgt: ; GCN: v_cmp_gt_i64_e64 -define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38) store i64 %result, i64 addrspace(1)* %out ret void @@ -148,7 +157,7 @@ define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_sge: ; GCN: v_cmp_ge_i64_e64 -define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39) store i64 %result, i64 addrspace(1)* %out ret void @@ -156,14 +165,14 @@ define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_slt: ; GCN: v_cmp_lt_i64_e64 -define void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i64_sle: ; GCN: v_cmp_le_i64_e64 -define void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41) store i64 %result, i64 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll index a65f422742c9..a9351dbb27d2 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}gather4_v2: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}gather4: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_cl: ; GCN: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x 
i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_l: ; GCN: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b: ; GCN: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_cl: ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_cl_v8: ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_lz_v2: ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_lz: ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -86,7 +86,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_o: ; GCN: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -95,7 +95,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_cl_o: ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -104,7 +104,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_cl_o_v8: ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -113,7 +113,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_l_o: ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -122,7 +122,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_l_o_v8: ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -131,7 +131,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_o: ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -140,7 +140,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_o_v8: ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_o_v8(<4 x float> 
addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -149,7 +149,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_cl_o: ; GCN: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -158,7 +158,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_lz_o: ; GCN: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -168,7 +168,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c: ; GCN: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -177,7 +177,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_cl: ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -186,7 +186,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_cl_v8: ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -195,7 +195,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_l: ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store 
<4 x float> %r, <4 x float> addrspace(1)* %out @@ -204,7 +204,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_l_v8: ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -213,7 +213,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b: ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -222,7 +222,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b_v8: ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -231,7 +231,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b_cl: ; GCN: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -240,7 +240,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_lz: ; GCN: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -250,7 +250,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_o: ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -259,7 +259,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_o_v8: ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -268,7 +268,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_cl_o: ; GCN: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -277,7 +277,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_l_o: ; GCN: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -286,7 +286,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b_o: ; GCN: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -295,7 +295,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b_cl_o: ; GCN: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -304,7 +304,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_lz_o: ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -313,7 +313,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_lz_o_v8: ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) { +define 
amdgpu_kernel void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -322,7 +322,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_f32: ; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @gather4_f32(float addrspace(1)* %out) { main_body: %r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store float %r, float addrspace(1)* %out @@ -331,7 +331,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_v2f32: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define void @gather4_v2f32(<2 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_v2f32(<2 x float> addrspace(1)* %out) { main_body: %r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1) store <2 x float> %r, <2 x float> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll index ef810a330017..2e78e2a4c6f5 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}getlod: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}getlod_v2: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,13 +21,23 @@ main_body: ; GCN-LABEL: {{^}}getlod_v4: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod_v4(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod_v4(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out ret void } +; GCN-LABEL: {{^}}adjust_writemask_getlod_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 
false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll index 69c43ca3070a..c74c0fa15855 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -1,146 +1,144 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=CHECK,VI %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s -;CHECK-LABEL: {{^}}image_load_v4i32: -;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) { +; GCN-LABEL: {{^}}image_load_v4i32: +; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret <4 x float> %tex } -;CHECK-LABEL: {{^}}image_load_v2i32: -;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) { +; GCN-LABEL: {{^}}image_load_v2i32: +; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret <4 x float> %tex } -;CHECK-LABEL: {{^}}image_load_i32: -;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) { +; GCN-LABEL: {{^}}image_load_i32: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret <4 x float> %tex } -;CHECK-LABEL: {{^}}image_load_mip: -;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) { +; GCN-LABEL: {{^}}image_load_mip: +; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <4 x float> 
@image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret <4 x float> %tex } -;CHECK-LABEL: {{^}}image_load_1: -;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) { +; GCN-LABEL: {{^}}image_load_1: +; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) %elt = extractelement <4 x float> %tex, i32 0 -; Only first component used, test that dmask etc. is changed accordingly ret float %elt } -;CHECK-LABEL: {{^}}image_load_f32_v2i32: -;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) { +; GCN-LABEL: {{^}}image_load_f32_v2i32: +; GCN: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { main_body: - %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0) + %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) ret float %tex } -;CHECK-LABEL: {{^}}image_load_v2f32_v4i32: -;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) { +; GCN-LABEL: {{^}}image_load_v2f32_v4i32: +; GCN: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: - %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0) + %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false) ret <2 x float> %tex } - -;CHECK-LABEL: {{^}}image_store_v4i32: -;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm -define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { +; GCN-LABEL: {{^}}image_store_v4i32: +; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm +define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 
false) ret void } -;CHECK-LABEL: {{^}}image_store_v2i32: -;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm -define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) { +; GCN-LABEL: {{^}}image_store_v2i32: +; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm +define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_i32: -;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm -define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) { +; GCN-LABEL: {{^}}image_store_i32: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_f32_i32: -;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm -define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) { +; GCN-LABEL: {{^}}image_store_f32_i32: +; GCN: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm +define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_v2f32_v4i32: -;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm -define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) { +; GCN-LABEL: {{^}}image_store_v2f32_v4i32: +; GCN: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm +define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_mip: -;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm -define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { +; GCN-LABEL: {{^}}image_store_mip: +; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm +define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, 
i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}getresinfo: -;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @getresinfo() { +; GCN-LABEL: {{^}}getresinfo: +; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf +define amdgpu_ps void @getresinfo() #0 { main_body: - %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0) + %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r0, float %r1, float %r2, float %r3, i1 true, i1 true) #0 ret void } ; Ideally, the register allocator would avoid the wait here ; -;CHECK-LABEL: {{^}}image_store_wait: -;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) expcnt(0) -;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm -define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) { +; GCN-LABEL: {{^}}image_store_wait: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) expcnt(0) +; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm +define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 { main_body: - call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0) - call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %arg, i32 15, i1 false, i1 false, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %arg4, <8 x i32> %arg1, i32 15, i1 false, i1 false, i1 false, i1 false) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %arg4, <8 x i32> %arg2, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } @@ -149,21 +147,22 @@ main_body: ; VI-LABEL: image_load_mmo ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) { - store float 0.0, float addrspace(3)* %lds - %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) +define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) #0 { +bb: + store float 0.000000e+00, float addrspace(3)* %lds + %tex = call float 
@llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 - store float 0.0, float addrspace(3)* %tmp2 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tex, float %tex, float %tex, float %tex) + store float 0.000000e+00, float addrspace(3)* %tmp2 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex, float %tex, float %tex, float %tex, i1 true, i1 true) #0 ret void } declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 -declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 @@ -173,10 +172,9 @@ declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll index 752ec2d42fac..4f90b0a25eaa 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}sample: ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cl: ; GCN: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, 
<4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ main_body: ; GCN-LABEL: {{^}}sample_d: ; GCN: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ main_body: ; GCN-LABEL: {{^}}sample_d_cl: ; GCN: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ main_body: ; GCN-LABEL: {{^}}sample_l: ; GCN: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ main_body: ; GCN-LABEL: {{^}}sample_b: ; GCN: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ main_body: ; GCN-LABEL: {{^}}sample_b_cl: ; GCN: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ main_body: ; GCN-LABEL: {{^}}sample_lz: ; GCN: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cd: ; GCN: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void 
@sample_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -84,7 +84,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cd_cl: ; GCN: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -93,7 +93,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c: ; GCN: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -102,7 +102,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cl: ; GCN: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -111,7 +111,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_d: ; GCN: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_d_cl: ; GCN: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -129,7 +129,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_l: ; GCN: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> 
addrspace(1)* %out @@ -138,7 +138,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_b: ; GCN: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -147,7 +147,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_b_cl: ; GCN: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -156,7 +156,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_lz: ; GCN: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -165,7 +165,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cd: ; GCN: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -174,7 +174,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cd_cl: ; GCN: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -183,7 +183,7 @@ main_body: ; GCN-LABEL: {{^}}sample_f32: ; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 -define void @sample_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @sample_f32(float addrspace(1)* %out) { main_body: %r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0) store float %r, float addrspace(1)* %out @@ -192,13 +192,221 @@ main_body: ; GCN-LABEL: {{^}}sample_v2f32: ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 -define void @sample_v2f32(<2 x float> addrspace(1)* %out) { +define 
amdgpu_kernel void @sample_v2f32(<2 x float> addrspace(1)* %out) { main_body: %r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0) store <2 x float> %r, <2 x float> addrspace(1)* %out ret void } +; GCN-LABEL: {{^}}adjust_writemask_sample_0: +; GCN: image_sample v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1{{$}} +define amdgpu_kernel void @adjust_writemask_sample_0(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_01: +; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x3{{$}} +define amdgpu_kernel void @adjust_writemask_sample_01(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt0 = extractelement <4 x float> %r, i32 0 + %elt1 = extractelement <4 x float> %r, i32 1 + store volatile float %elt0, float addrspace(1)* %out + store volatile float %elt1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_012: +; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x7{{$}} +define amdgpu_kernel void @adjust_writemask_sample_012(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt0 = extractelement <4 x float> %r, i32 0 + %elt1 = extractelement <4 x float> %r, i32 1 + %elt2 = extractelement <4 x float> %r, i32 2 + store volatile float %elt0, float addrspace(1)* %out + store volatile float %elt1, float addrspace(1)* %out + store volatile float %elt2, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_12: +; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x6{{$}} +define amdgpu_kernel void @adjust_writemask_sample_12(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt1 = extractelement <4 x float> %r, i32 1 + %elt2 = extractelement <4 x float> %r, i32 2 + store volatile float %elt1, float addrspace(1)* %out + store volatile float %elt2, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_03: +; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x9{{$}} +define amdgpu_kernel void @adjust_writemask_sample_03(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt0 = extractelement <4 x float> %r, i32 0 + %elt3 = extractelement <4 x float> %r, i32 3 + store volatile float %elt0, float addrspace(1)* %out + store volatile float %elt3, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}adjust_writemask_sample_13: +; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xa{{$}} +define amdgpu_kernel void @adjust_writemask_sample_13(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt1 = extractelement <4 x float> %r, i32 1 + %elt3 = extractelement <4 x float> %r, i32 3 + store volatile float %elt1, float addrspace(1)* %out + store volatile float %elt3, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_123: +; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xe{{$}} +define amdgpu_kernel void @adjust_writemask_sample_123(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt1 = extractelement <4 x float> %r, i32 1 + %elt2 = extractelement <4 x float> %r, i32 2 + %elt3 = extractelement <4 x float> %r, i32 3 + store volatile float %elt1, float addrspace(1)* %out + store volatile float %elt2, float addrspace(1)* %out + store volatile float %elt3, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_variable_dmask_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 %dmask, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + + +; GCN-LABEL: {{^}}adjust_writemask_sample_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_cl_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_d_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void 
@adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_l_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_b_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_lz_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_cd_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_cd_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float>, 
<8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll index d10fd0824692..42d7bc0e7778 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}sample: ; GCN: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cl: ; GCN: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ main_body: ; GCN-LABEL: {{^}}sample_d: ; GCN: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ main_body: ; GCN-LABEL: {{^}}sample_d_cl: ; GCN: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ main_body: ; GCN-LABEL: {{^}}sample_l: ; GCN: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ main_body: ; GCN-LABEL: {{^}}sample_b: ; GCN: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 
x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ main_body: ; GCN-LABEL: {{^}}sample_b_cl: ; GCN: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ main_body: ; GCN-LABEL: {{^}}sample_lz: ; GCN: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cd: ; GCN: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -84,7 +84,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cd_cl: ; GCN: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -93,7 +93,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c: ; GCN: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -102,7 +102,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cl: ; GCN: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -111,7 +111,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_d: ; GCN: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_d_cl: ; GCN: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -129,7 +129,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_l: ; GCN: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -138,7 +138,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_b: ; GCN: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -147,7 +147,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_b_cl: ; GCN: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -156,7 +156,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_lz: ; GCN: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -165,7 +165,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cd: ; GCN: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call 
<4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -174,13 +174,232 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cd_cl: ; GCN: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out ret void } +; GCN-LABEL: {{^}}adjust_writemask_sample_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_cl_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_d_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_l_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_b_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> 
@llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_lz_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_cd_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_cl_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> 
@llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_cl_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_l_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_cl_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_lz_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_cl_o_none_enabled: +; GCN-NOT: image +; GCN-NOT: store +define amdgpu_kernel void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) { +main_body: + %r = call <4 x float> 
@llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) + %elt0 = extractelement <4 x float> %r, i32 0 + store float %elt0, float addrspace(1)* %out + ret void +} declare <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll index 9ba5c69a9a24..c4795a23cd5b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -1,5 +1,7 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=kabini -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s +; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s ; GCN-LABEL: {{^}}v_interp: ; GCN-NOT: s_wqm @@ -8,17 +10,17 @@ ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}} ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}} ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}} -define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x float>) { +define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x float> %arg4) #0 { main_body: - %i = extractelement <2 x float> %4, i32 0 - %j = extractelement <2 x float> %4, i32 1 - %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %3) - %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %3) - %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %3) - %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %3) - %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %3) + %i = extractelement <2 x float> %arg4, i32 0 + %j = extractelement <2 x float> %arg4, i32 1 + %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %arg3) + %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %arg3) + %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %arg3) + %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %arg3) + %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg3) %w = fadd float %p1_1, %const - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %w) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_0, float %p1_1, float %w, i1 true, i1 true) #0 ret void } @@ -37,7 +39,8 @@ main_body: ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.w{{$}} ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.w{{$}} ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}} -define amdgpu_ps void @v_interp_p1(float 
%i) { +define amdgpu_ps void @v_interp_p1(float %i) #0 { +bb: %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 256) %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 256) %p0_2 = call float @llvm.amdgcn.interp.p1(float %i, i32 2, i32 0, i32 256) @@ -77,7 +80,8 @@ define amdgpu_ps void @v_interp_p1(float %i) { ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.x{{$}} ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}} ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}} -define amdgpu_ps void @v_interp_p2(float %x, float %j) { +define amdgpu_ps void @v_interp_p2(float %x, float %j) #0 { +bb: %p2_0 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 0, i32 0, i32 256) %p2_1 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 1, i32 0, i32 256) %p2_2 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 2, i32 0, i32 256) @@ -118,7 +122,8 @@ define amdgpu_ps void @v_interp_p2(float %x, float %j) { ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p10, attr64.y{{$}} ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_3, attr64.y{{$}} ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_10, attr64.x{{$}} -define amdgpu_ps void @v_interp_mov(float %x, float %j) { +define amdgpu_ps void @v_interp_mov(float %x, float %j) #0 { +bb: %mov_0 = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 256) %mov_1 = call float @llvm.amdgcn.interp.mov(i32 1, i32 0, i32 0, i32 256) %mov_2 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 256) @@ -161,23 +166,57 @@ define amdgpu_ps void @v_interp_mov(float %x, float %j) { ; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}} ; VI: s_mov_b32 m0, -1{{$}} ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) { - store float 0.0, float addrspace(3)* %lds +define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 { +bb: + store float 0.000000e+00, float addrspace(3)* %lds %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0) %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 - store float 0.0, float addrspace(3)* %tmp2 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) + store float 0.000000e+00, float addrspace(3)* %tmp2 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 ret void } -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 +; Test that v_interp_p1 uses different source and destination registers +; on 16 bank LDS chips.
-; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 - -declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0 +; GCN-LABEL: {{^}}v_interp_p1_bank16_bug: +; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] +define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg13, [17 x <4 x i32>] addrspace(2)* byval %arg14, [34 x <8 x i32>] addrspace(2)* byval %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) #0 { +main_body: + %i.i = extractelement <2 x i32> %arg19, i32 0 + %j.i = extractelement <2 x i32> %arg19, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg17) #0 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg17) #0 + %i.i7 = extractelement <2 x i32> %arg19, i32 0 + %j.i8 = extractelement <2 x i32> %arg19, i32 1 + %i.f.i9 = bitcast i32 %i.i7 to float + %j.f.i10 = bitcast i32 %j.i8 to float + %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 0, i32 %arg17) #0 + %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 0, i32 %arg17) #0 + %i.i1 = extractelement <2 x i32> %arg19, i32 0 + %j.i2 = extractelement <2 x i32> %arg19, i32 1 + %i.f.i3 = bitcast i32 %i.i1 to float + %j.f.i4 = bitcast i32 %j.i2 to float + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 0, i32 %arg17) #0 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 0, i32 %arg17) #0 + %tmp = call float @llvm.fabs.f32(float %p2.i) + %tmp34 = call float @llvm.fabs.f32(float %p2.i12) + %tmp35 = call float @llvm.fabs.f32(float %p2.i6) + %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp, float %tmp34) + %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp35, float 1.000000e+00) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 true) #0 + ret void +} -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 -attributes #0 = { nounwind readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 5d4d4cd7ee46..055dddbfa8af 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -8,7 +8,7 @@ ; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa -define void @test(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 { %kernarg.segment.ptr = call noalias i8 
addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 @@ -20,7 +20,7 @@ define void @test(i32 addrspace(1)* %out) #1 { ; ALL-LABEL: {{^}}test_implicit: ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15 -define void @test_implicit(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 @@ -39,7 +39,7 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 { ; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]] ; MESA: buffer_store_dword [[V_VAL]] ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]] -define void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 { +define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 { %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() %arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %arg.ptr diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll index 6720cbe9d8da..fe211d356070 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b) ; GCN: buffer_load_dword v[[B_I32:[0-9]+]] ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]] ; GCN: buffer_store_short v[[R_F16]] -define void @ldexp_f16( +define amdgpu_kernel void @ldexp_f16( half addrspace(1)* %r, half addrspace(1)* %a, i32 addrspace(1)* %b) { @@ -22,7 +22,7 @@ define void @ldexp_f16( ; GCN: buffer_load_dword v[[B_I32:[0-9]+]] ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]] ; GCN: buffer_store_short v[[R_F16]] -define void @ldexp_f16_imm_a( +define amdgpu_kernel void @ldexp_f16_imm_a( half addrspace(1)* %r, i32 addrspace(1)* %b) { %b.val = load i32, i32 addrspace(1)* %b @@ -35,7 +35,7 @@ define void @ldexp_f16_imm_a( ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}} ; GCN: buffer_store_short v[[R_F16]] -define void @ldexp_f16_imm_b( +define amdgpu_kernel void @ldexp_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { %a.val = load half, half addrspace(1)* %a diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll index a23defd742a8..1ab4e8b80630 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll @@ -7,7 +7,7 @@ declare double @llvm.amdgcn.ldexp.f64(double, i32) nounwind readnone ; SI-LABEL: {{^}}test_ldexp_f32: ; SI: v_ldexp_f32 ; SI: s_endpgm -define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { %result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind ; SI-LABEL: {{^}}test_ldexp_f64: ; SI: v_ldexp_f64 ; SI: s_endpgm -define void @test_ldexp_f64(double 
addrspace(1)* %out, double %a, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { %result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void @@ -24,7 +24,7 @@ define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwi ; SI-LABEL: {{^}}test_ldexp_undef_f32: ; SI-NOT: v_ldexp_f32 -define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind { %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll index 014369b45015..bc599897f82a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_lerp: ; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind { +define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind { %result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll index f78257f1d226..feecd6c0e35d 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll @@ -7,7 +7,7 @@ declare float @llvm.amdgcn.log.clamp.f32(float) #0 ; GCN-LABEL: {{^}}v_log_clamp_f32: ; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 { %log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0 store float %log.clamp, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll index 303446b63315..ab76c870796b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -1,24 +1,22 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mbcnt_intrinsics: ; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0 ; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] ; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] - -define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) { +define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) { main_body: - %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 - %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1 - %4 = bitcast i32 %hi to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %4, float %4, float %4, float %4) + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %hi 
= call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #0 + %tmp = bitcast i32 %hi to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp, float %tmp, float %tmp, float %tmp, i1 true, i1 true) #1 ret void } -declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 - -declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll index 35fdba8f34a3..8baaad190406 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -7,7 +7,7 @@ ; VI: v_mov_b32_e32 v0, s{{[0-9]+}} ; VI: s_nop 1 ; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] -define void @dpp_test(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) { %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0 store i32 %tmp0, i32 addrspace(1)* %out ret void @@ -19,7 +19,7 @@ define void @dpp_test(i32 addrspace(1)* %out, i32 %in) { ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI: s_nop 1 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 -define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0 %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0 store i32 %tmp1, i32 addrspace(1)* %out @@ -36,7 +36,7 @@ define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI: s_nop 1 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 -define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) { +define amdgpu_kernel void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) { %cmp = fcmp oeq float %cond, 0.0 br i1 %cmp, label %if, label %else diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll index 7c2495e096ec..3a2b87cd87f3 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll @@ -5,7 +5,7 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8: ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: 
{{^}}v_mqsad_pk_u16_u8_non_immediate: ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { +define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll index 04bb97a9eb57..a8d03bf6bbac 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll @@ -5,7 +5,7 @@ declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> ) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { +define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { +define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> ) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %ou ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { +define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> ) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -37,7 +37,7 @@ define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i6 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* 
%out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) { +define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) { %in = load <4 x i32>, <4 x i32> addrspace(1) * %input %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll index 83d13ab26846..dfaac042227c 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.msad.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_msad_u8: ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_msad_u8_non_immediate: ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index fd1a463fd3e9..f0af876567b4 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s ; CHECK-LABEL: {{^}}test1: ; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec @@ -7,7 +7,7 @@ ; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However, ; the expectation is that the intrinsic will be used in non-trivial shaders, ; so such an optimization doesn't seem worth the effort. 
-define amdgpu_ps float @test1() { +define amdgpu_ps float @test1() #0 { %live = call i1 @llvm.amdgcn.ps.live() %live.32 = zext i1 %live to i32 %r = bitcast i32 %live.32 to float @@ -19,12 +19,11 @@ define amdgpu_ps float @test1() { ; CHECK-DAG: s_wqm_b64 exec, exec ; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]] ; CHECK: image_sample v0, [[VAR]], -define amdgpu_ps float @test2() { +define amdgpu_ps float @test2() #0 { %live = call i1 @llvm.amdgcn.ps.live() %live.32 = zext i1 %live to i32 - - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %live.32.bc = bitcast i32 %live.32 to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %live.32.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %r = extractelement <4 x float> %t, i32 0 ret float %r } @@ -35,7 +34,7 @@ define amdgpu_ps float @test2() { ; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1 ; CHECK_DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]] ; CHECK: ; %dead -define amdgpu_ps float @test3(i32 %in) { +define amdgpu_ps float @test3(i32 %in) #0 { entry: %live = call i1 @llvm.amdgcn.ps.live() br i1 %live, label %end, label %dead @@ -46,14 +45,15 @@ dead: end: %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ] - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tc.bc = bitcast i32 %tc to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %tc.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %r = extractelement <4 x float> %t, i32 0 ret float %r } -declare i1 @llvm.amdgcn.ps.live() #0 - -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i1 @llvm.amdgcn.ps.live() #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -attributes #0 = { nounwind readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll index ece4224f6e67..be71225c5e06 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll @@ -5,7 +5,7 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8: ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate: ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { +define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, 
i32 %a, i64 %b) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll index 6bf871543ca2..9200fe7c67b1 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_queue_ptr = 1 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* %value = load i32, i32 addrspace(2)* %header_ptr diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll index f0b8e2a0293f..0f1fa15f47cc 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.rcp.f16(half %a) ; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @rcp_f16( +define amdgpu_kernel void @rcp_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll index d53861456c78..71db76d902b7 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll @@ -7,7 +7,7 @@ declare float @llvm.amdgcn.rcp.legacy(float) #0 ; GCN-LABEL: {{^}}rcp_legacy_f32: ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float %src) #0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 { ; TODO: Really these should be constant folded ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_4.0 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 4.0 -define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float 4.0) #0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -24,7 +24,7 @@ define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_100.0 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float 100.0) #0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -32,7 +32,7 @@ define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}rcp_legacy_undef_f32: ; GCN-NOT: v_rcp_legacy_f32 -define void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float undef) store float %rcp, float addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll index 825231bf8680..ad2d84b7911b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll 
@@ -8,17 +8,35 @@ declare float @llvm.sqrt.f32(float) #0 ; FUNC-LABEL: {{^}}rcp_undef_f32: ; SI-NOT: v_rcp_f32 -define void @rcp_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_undef_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.f32(float undef) store float %rcp, float addrspace(1)* %out, align 4 ret void } +; FUNC-LABEL: {{^}}rcp_2_f32: +; SI-NOT: v_rcp_f32 +; SI: v_mov_b32_e32 v{{[0-9]+}}, 0.5 +define amdgpu_kernel void @rcp_2_f32(float addrspace(1)* %out) #1 { + %rcp = call float @llvm.amdgcn.rcp.f32(float 2.0) + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_10_f32: +; SI-NOT: v_rcp_f32 +; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x3dcccccd +define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 { + %rcp = call float @llvm.amdgcn.rcp.f32(float 10.0) + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}safe_no_fp32_denormals_rcp_f32: ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] -define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -28,7 +46,7 @@ define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] -define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 { +define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -36,7 +54,7 @@ define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src ; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32: ; SI: v_div_scale_f32 -define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 { +define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -45,7 +63,7 @@ define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %s ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32: ; SI: v_sqrt_f32_e32 ; SI: v_rcp_f32_e32 -define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) store float %rcp, float addrspace(1)* %out, align 4 @@ -54,7 +72,7 @@ define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32: ; SI: v_rsq_f32_e32 -define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 { +define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) store float %rcp, float addrspace(1)* %out, align 4 @@ -65,7 +83,7 @@ define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 { ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @rcp_f64(double 
addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double %src) #1 { %rcp = call double @llvm.amdgcn.rcp.f64(double %src) store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -75,7 +93,7 @@ define void @rcp_f64(double addrspace(1)* %out, double %src) #1 { ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 { %rcp = call double @llvm.amdgcn.rcp.f64(double %src) store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -83,7 +101,7 @@ define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 { ; FUNC-LABEL: {{^}}rcp_pat_f64: ; SI: v_div_scale_f64 -define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { %rcp = fdiv double 1.0, %src store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -93,7 +111,7 @@ define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { %rcp = fdiv double 1.0, %src store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -103,7 +121,7 @@ define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { ; SI-NOT: v_rsq_f64_e32 ; SI: v_sqrt_f64 ; SI: v_rcp_f64 -define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { %sqrt = call double @llvm.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) store double %rcp, double addrspace(1)* %out, align 8 @@ -114,7 +132,7 @@ define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { ; SI: v_rsq_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { %sqrt = call double @llvm.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) store double %rcp, double addrspace(1)* %out, align 8 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 2569108e7b18..9f5c809455ea 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #0 ; CHECK-LABEL: {{^}}test_readfirstlane: ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}} -define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { +define amdgpu_kernel void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { ; CHECK-LABEL: {{^}}test_readfirstlane_imm: ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 ; CHECK: 
v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] -define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void @@ -25,7 +25,7 @@ define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] -define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"() %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 436ffff692c6..5e892fad3741 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.readlane(i32, i32) #0 ; CHECK-LABEL: {{^}}test_readlane_sreg: ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 ; CHECK-LABEL: {{^}}test_readlane_imm_sreg: ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} -define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void @@ -25,7 +25,7 @@ define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} -define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"() %readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 @@ -34,7 +34,7 @@ define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { ; CHECK-LABEL: {{^}}test_readlane_imm: ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32 -define void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 { +define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0 store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll index 5f40e0d0986f..3611047f1277 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -12,7 +12,7 @@ declare double @llvm.amdgcn.rsq.clamp.f64(double) #1 ; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] ; VI: 
v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]] ; VI: buffer_store_dword [[RESULT]] -define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src) store float %rsq_clamp, float addrspace(1)* %out ret void @@ -30,7 +30,7 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] ; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]] -define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { +define amdgpu_kernel void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) store double %rsq_clamp, double addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { ; FUNC-LABEL: {{^}}rsq_clamp_undef_f32: ; SI-NOT: v_rsq_clamp_f32 -define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 { %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef) store float %rsq_clamp, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll index 2022d0289862..fd4802140810 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.rsq.f16(half %a) ; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @rsq_f16( +define amdgpu_kernel void @rsq_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll index 47bd0d82b834..7f4c2cb19a32 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll @@ -4,7 +4,7 @@ declare float @llvm.amdgcn.rsq.legacy(float) #0 ; FUNC-LABEL: {{^}}rsq_legacy_f32: ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0 -define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ 
define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_legacy_undef_f32: ; SI-NOT: v_rsq_legacy_f32 -define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float undef) store float %rsq, float addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll index c644288977a3..0ce26d0fe876 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll @@ -6,7 +6,7 @@ declare double @llvm.amdgcn.rsq.f64(double) #0 ; FUNC-LABEL: {{^}}rsq_f32: ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @rsq_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -15,7 +15,7 @@ define void @rsq_f32(float addrspace(1)* %out, float %src) #1 { ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 -define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -23,7 +23,7 @@ define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -31,7 +31,7 @@ define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_f64: ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @rsq_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rsq_f64(double addrspace(1)* %out, double %src) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -40,7 +40,7 @@ define void @rsq_f64(double addrspace(1)* %out, double %src) #1 { ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_f64_constant_4.0 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0 -define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -50,7 +50,7 @@ define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000 ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}} ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -58,7 +58,7 @@ define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_undef_f32: ; SI-NOT: v_rsq_f32 -define void @rsq_undef_f32(float addrspace(1)* 
%out) #1 { +define amdgpu_kernel void @rsq_undef_f32(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float undef) store float %rsq, float addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 132e476d5e29..5f8ca28ec5f0 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -1,10 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}test_barrier: -; GCN: buffer_store_dword -; GCN: s_waitcnt +; GFX8: buffer_store_dword +; GFX8: s_waitcnt +; GFX9: flat_store_dword +; GFX9-NOT: s_waitcnt ; GCN: s_barrier -define void @test_barrier(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out) #0 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll index ecd4ac6824cc..b488565c6b3a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll @@ -9,7 +9,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0 ; SI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7] ; VI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00] ; GCN-NEXT: s_endpgm -define void @test_s_dcache_inv() #0 { +define amdgpu_kernel void @test_s_dcache_inv() #0 { call void @llvm.amdgcn.s.dcache.inv() ret void } @@ -18,7 +18,7 @@ define void @test_s_dcache_inv() #0 { ; GCN-NEXT: ; BB#0: ; GCN: s_dcache_inv ; GCN: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_inv_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.inv() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll index 097f35d42c4f..a3a5c329f411 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll @@ -9,7 +9,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0 ; CI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x40,0xc7] ; VI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x88,0xc0,0x00,0x00,0x00,0x00] ; GCN-NEXT: s_endpgm -define void @test_s_dcache_inv_vol() #0 { +define amdgpu_kernel void @test_s_dcache_inv_vol() #0 { call void @llvm.amdgcn.s.dcache.inv.vol() ret void } @@ -18,7 +18,7 @@ define void @test_s_dcache_inv_vol() #0 { ; GCN-NEXT: ; BB#0: ; GCN-NEXT: s_dcache_inv_vol ; GCN: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_inv_vol_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.inv.vol() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll index 9ecce7463f6b..909a85dda3e8 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll @@ -7,7 +7,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0 ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb ; encoding: 
[0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_endpgm -define void @test_s_dcache_wb() #0 { +define amdgpu_kernel void @test_s_dcache_wb() #0 { call void @llvm.amdgcn.s.dcache.wb() ret void } @@ -16,7 +16,7 @@ define void @test_s_dcache_wb() #0 { ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb ; VI: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_wb_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.wb() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll index 943f8c67a2e3..217bf97c41a4 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll @@ -7,7 +7,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0 ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_endpgm -define void @test_s_dcache_wb_vol() #0 { +define amdgpu_kernel void @test_s_dcache_wb_vol() #0 { call void @llvm.amdgcn.s.dcache.wb.vol() ret void } @@ -16,7 +16,7 @@ define void @test_s_dcache_wb_vol() #0 { ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb_vol ; VI: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_wb_vol_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.wb.vol() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll index 72513fc86f49..8f64c50b9c60 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll @@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.decperflevel(i32) #0 ; GCN: s_decperflevel 13{{$}} ; GCN: s_decperflevel 14{{$}} ; GCN: s_decperflevel 15{{$}} -define void @test_s_decperflevel(i32 %x) #0 { +define amdgpu_kernel void @test_s_decperflevel(i32 %x) #0 { call void @llvm.amdgcn.s.decperflevel(i32 0) call void @llvm.amdgcn.s.decperflevel(i32 1) call void @llvm.amdgcn.s.decperflevel(i32 2) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll index 4304398182a6..906a8a3e05f4 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}s_getreg_test: ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23) -define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. +define amdgpu_kernel void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) %lds_size_bytes = shl i32 %lds_size_64dwords, 8 store i32 %lds_size_bytes, i32 addrspace(1)* %out @@ -14,7 +14,7 @@ define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size ; Call site has additional readnone knowledge. ; GCN-LABEL: {{^}}readnone_s_getreg_test: ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23) -define void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. +define amdgpu_kernel void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. 
%lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #1 %lds_size_bytes = shl i32 %lds_size_64dwords, 8 store i32 %lds_size_bytes, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll index 2ae4fc473eaa..49e6e4257906 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll @@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.incperflevel(i32) #0 ; GCN: s_incperflevel 13{{$}} ; GCN: s_incperflevel 14{{$}} ; GCN: s_incperflevel 15{{$}} -define void @test_s_incperflevel(i32 %x) #0 { +define amdgpu_kernel void @test_s_incperflevel(i32 %x) #0 { call void @llvm.amdgcn.s.incperflevel(i32 0) call void @llvm.amdgcn.s.incperflevel(i32 1) call void @llvm.amdgcn.s.incperflevel(i32 2) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll index d8eda10fdfd8..66041037168a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll @@ -10,7 +10,7 @@ declare i64 @llvm.amdgcn.s.memrealtime() #0 ; GCN-NOT: lgkmcnt ; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: _store_dwordx2 -define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_s_memrealtime(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.amdgcn.s.memrealtime() store volatile i64 %cycle0, i64 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll index ff9d74619788..6aef769bafad 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll @@ -11,7 +11,7 @@ declare i64 @llvm.amdgcn.s.memtime() #0 ; GCN-NOT: lgkmcnt ; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 -define void @test_s_memtime(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.amdgcn.s.memtime() store volatile i64 %cycle0, i64 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll index 870aa48a3417..59c910c71c5a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll @@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.sleep(i32) #0 ; GCN: s_sleep 13{{$}} ; GCN: s_sleep 14{{$}} ; GCN: s_sleep 15{{$}} -define void @test_s_sleep(i32 %x) #0 { +define amdgpu_kernel void @test_s_sleep(i32 %x) #0 { call void @llvm.amdgcn.s.sleep(i32 0) call void @llvm.amdgcn.s.sleep(i32 1) call void @llvm.amdgcn.s.sleep(i32 2) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll index 3aaed9d53772..2a3705de2b44 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.hi.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_hi_u8: ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_sad_hi_u8_non_immediate: ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} 
-define void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll index 5438571c5821..c404531513e7 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.u16(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_u16: ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_sad_u16_non_immediate: ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll index 9422d7620ca6..1ee876aa724e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_u8: ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_sad_u8_non_immediate: ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll new file mode 100644 index 000000000000..593c95856811 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll @@ -0,0 +1,556 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg: +; GCN: v_bfe_i32 +define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_arg_arg_imm: +; GCN: v_bfe_i32 +define amdgpu_kernel void 
@bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_arg_imm_arg: +; GCN: v_bfe_i32 +define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_imm_arg_arg: +; GCN: v_bfe_i32 +define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_bfe_print_arg: +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 +define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 { + %load = load i32, i32 addrspace(1)* %src0, align 4 + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_6: +; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_7: +; GCN-NOT: shl +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_8: +; GCN: buffer_load_dword +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_9: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 
@llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_10: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_11: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_12: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_13: +; GCN: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; GCN-LABEL: {{^}}bfe_i32_test_14: +; GCN-NOT: lshr +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_0: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_1: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_2: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_3: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; GCN: buffer_store_dword [[VREG]], +; GCN: 
s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_4: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_5: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_6: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_7: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_8: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_9: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_10: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_11: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_12: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; 
GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_13: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_14: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_15: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_16: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_17: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_18: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { + %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1) + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_sext_in_reg_i24: +; GCN: buffer_load_dword [[LOAD:v[0-9]+]], +; GCN-NOT: v_lshl +; GCN-NOT: v_ashr +; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 +; GCN: buffer_store_dword [[BFE]], +define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24) + %shl = shl i32 %bfe, 8 + %ashr = ashr i32 %shl, 8 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: @simplify_demanded_bfe_sdiv +; GCN: buffer_load_dword [[LOAD:v[0-9]+]] +; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 +; GCN: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] +; GCN: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] +; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] +; GCN: buffer_store_dword [[TMP2]] +define amdgpu_kernel void 
@simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %src = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) + %div = sdiv i32 %bfe, 2 + store i32 %div, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_0_width: +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 8, i32 0) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_8_bfe_8: +; GCN: v_bfe_i32 +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) + %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8) + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_8_bfe_16: +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 +; GCN: s_endpgm +define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) + %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 16) + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; This really should be folded into 1 +; GCN-LABEL: {{^}}bfe_16_bfe_8: +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 16) + %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8) + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure there isn't a redundant BFE +; GCN-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe: +; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 0, i32 8) + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: +define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 8, i32 0) + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}sextload_i8_to_i32_bfe: +; GCN: buffer_load_sbyte +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 0, i32 8) + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN: .text +; GCN-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void 
@sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 8, i32 0) + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0: +; GCN-NOT: shr +; GCN-NOT: shl +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 +; GCN: s_endpgm +define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1: +; GCN: buffer_load_dword +; GCN-NOT: shl +; GCN-NOT: shr +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 +; GCN: s_endpgm +define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1: +; GCN: buffer_load_dword +; GCN-NOT: v_lshl +; GCN-NOT: v_ashr +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 +; GCN: s_endpgm +define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 2) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll new file mode 100644 index 000000000000..94aeb077ebef --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll @@ -0,0 +1,127 @@ +;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}test_interrupt: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsg sendmsg(MSG_INTERRUPT) +define amdgpu_kernel void @test_interrupt() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0); + ret void +} + +; GCN-LABEL: {{^}}test_gs_emit: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) +define amdgpu_kernel void @test_gs_emit() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0); + ret void +} + +; GCN-LABEL: {{^}}test_gs_cut: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) +define amdgpu_kernel void @test_gs_cut() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0); + ret void +} + +; GCN-LABEL: {{^}}test_gs_emit_cut: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) +define amdgpu_kernel void @test_gs_emit_cut() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0) + ret void +} + +; GCN-LABEL: {{^}}test_gs_done: +; GCN: s_mov_b32 m0, 0 
+; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) +define amdgpu_kernel void @test_gs_done() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) + ret void +} + +; GCN-LABEL: {{^}}sendmsg: +; GCN: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) +; GCN-NEXT: s_endpgm +define amdgpu_gs void @sendmsg(i32 inreg %a) #0 { + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a) + ret void +} + +; GCN-LABEL: {{^}}sendmsghalt: +; GCN: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT) +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @sendmsghalt(i32 inreg %a) #0 { + call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a) + ret void +} + +; GCN-LABEL: {{^}}test_interrupt_halt: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsghalt sendmsg(MSG_INTERRUPT) +define amdgpu_kernel void @test_interrupt_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}test_gs_emit_halt: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0) +define amdgpu_kernel void @test_gs_emit_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0) + ret void +} + +; GCN-LABEL: {{^}}test_gs_cut_halt: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1) +define amdgpu_kernel void @test_gs_cut_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0) + ret void +} + +; GCN-LABEL: {{^}}test_gs_emit_cut_halt: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) +define amdgpu_kernel void @test_gs_emit_cut_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0) + ret void +} + +; GCN-LABEL: {{^}}test_gs_done_halt: +; GCN: s_mov_b32 m0, 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP) +define amdgpu_kernel void @test_gs_done_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0) + ret void +} + +declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 +declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll index d453d03cded8..495e36b09f8f 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll @@ -2,14 +2,13 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.sffbh.i32(i32) #1 -declare i32 @llvm.AMDGPU.flbit.i32(i32) #1 ; FUNC-LABEL: {{^}}s_flbit: ; GCN: s_load_dword [[VAL:s[0-9]+]], ; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], -define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 { +define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 { %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val) store i32 %r, i32 addrspace(1)* %out, align 4 ret void @@ -19,36 +18,12 @@ define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 { ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], -define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { %val = load 
i32, i32 addrspace(1)* %valptr, align 4 %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val) store i32 %r, i32 addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: {{^}}legacy_s_flbit: -; GCN: s_load_dword [[VAL:s[0-9]+]], -; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] -; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; GCN: buffer_store_dword [[VRESULT]], -; GCN: s_endpgm -define void @legacy_s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone - store i32 %r, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}legacy_v_flbit: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @legacy_v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone - store i32 %r, i32 addrspace(1)* %out, align 4 - ret void -} - attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll index fac0e352614c..4b930bfa210c 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.sin.f16(half %a) ; VI: v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sin_f16( +define amdgpu_kernel void @sin_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll index e3692fc5906c..0b7064da23f9 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.sin.f32(float) #0 ; GCN-LABEL: {{^}}v_sin_f32: ; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_sin_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_sin_f32(float addrspace(1)* %out, float %src) #1 { %sin = call float @llvm.amdgcn.sin.f32(float %src) #0 store float %sin, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll index caac6ddbeb80..e0cec2134e70 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll @@ -9,7 +9,7 @@ declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load double, double addrspace(1)* %aptr, align 8 %b = load i32, i32 addrspace(1)* %bptr, align 4 %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone @@ -22,7 +22,7 @@ define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { +define amdgpu_kernel void 
@test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { %a = load double, double addrspace(1)* %aptr, align 8 %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone store double %result, double addrspace(1)* %out, align 8 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll new file mode 100644 index 000000000000..92e3a1099da0 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -0,0 +1,623 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg: +; GCN: v_bfe_u32 +define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_arg_arg_imm: +; GCN: v_bfe_u32 +define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_arg_imm_arg: +; GCN: v_bfe_u32 +define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_imm_arg_arg: +; GCN: v_bfe_u32 +define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_zextload_i8: +; GCN: buffer_load_ubyte +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %load = load i8, i8 addrspace(1)* %in + %ext = zext i8 %load to i32 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8: +; GCN: buffer_load_dword +; GCN: v_add_i32 +; GCN-NEXT: v_and_b32_e32 +; FIXME: Should be using s_add_i32 +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8) + store i32 
%bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i16: +; GCN: buffer_load_dword +; GCN: v_add_i32 +; GCN-NEXT: v_and_b32_e32 +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 16) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: +; GCN: buffer_load_dword +; GCN: v_add_i32 +; GCN: bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 1, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: +; GCN: buffer_load_dword +; GCN: v_add_i32 +; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 +; GCN-NEXT: bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 3, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: +; GCN: buffer_load_dword +; GCN: v_add_i32 +; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 +; GCN-NEXT: bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 7, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: +; GCN: buffer_load_dword +; GCN: v_add_i32 +; GCN-NEXT: bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 8, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_1: +; GCN: buffer_load_dword +; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: 
{{^}}bfe_u32_test_4: +; GCN-NOT: lshl +; GCN-NOT: shr +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = lshr i32 %shl, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_5: +; GCN: buffer_load_dword +; GCN-NOT: lshl +; GCN-NOT: shr +; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_6: +; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_7: +; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_8: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_9: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_10: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_11: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24) + store i32 %bfe, 
i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_12: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_13: +; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; GCN-LABEL: {{^}}bfe_u32_test_14: +; GCN-NOT: lshr +; GCN-NOT: {{[^@]}}bfe +; GCN: s_endpgm +define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_0: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_1: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_2: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_3: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_4: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_5: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void 
@bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_6: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_7: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_8: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_9: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_10: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_11: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_12: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_13: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_14: +; GCN-NOT: {{[^@]}}bfe 
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_15: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_16: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_17: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_18: +; GCN-NOT: {{[^@]}}bfe +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[VREG]], +; GCN: s_endpgm +; EG-NOT: BFE +define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1) + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure that SimplifyDemandedBits doesn't cause the and to be +; reduced to the bits demanded by the bfe. + +; XXX: The operand to v_bfe_u32 could also just directly be the load register. 
+; GCN-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: +; GCN: buffer_load_dword [[ARG:v[0-9]+]] +; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] +; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 +; GCN-DAG: buffer_store_dword [[AND]] +; GCN-DAG: buffer_store_dword [[BFE]] +; GCN: s_endpgm +define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, + i32 addrspace(1)* %out1, + i32 addrspace(1)* %in) #0 { + %src = load i32, i32 addrspace(1)* %in, align 4 + %and = and i32 %src, 63 + %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2) + store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 + store i32 %and, i32 addrspace(1)* %out1, align 4 + ret void +} + +; GCN-LABEL: {{^}}lshr_and: +; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; GCN: buffer_store_dword +define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { + %b = lshr i32 %a, 6 + %c = and i32 %b, 7 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}v_lshr_and: +; GCN: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 +; GCN: buffer_store_dword +define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %c = lshr i32 %a, %b + %d = and i32 %c, 7 + store i32 %d, i32 addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}and_lshr: +; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; GCN: buffer_store_dword +define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { + %b = and i32 %a, 448 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}and_lshr2: +; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; GCN: buffer_store_dword +define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { + %b = and i32 %a, 511 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}shl_lshr: +; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 +; GCN: buffer_store_dword +define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 { + %b = shl i32 %a, 9 + %c = lshr i32 %b, 11 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll index e85179755371..e305f8eff587 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll @@ -4,7 +4,7 @@ ; GCN-DAG: ; wave barrier ; GCN-NOT: s_barrier -define void @test_wave_barrier() #0 { +define amdgpu_kernel void @test_wave_barrier() #0 { entry: call void @llvm.amdgcn.wave.barrier() #1 ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll index 58529b874442..349e7f0f0e8d 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll @@ -34,7 +34,7 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #0 ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %id, i32 addrspace(1)* %out ret void @@ -61,7 +61,7 @@ define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 { ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 
; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.y() store i32 %id, i32 addrspace(1)* %out ret void @@ -96,7 +96,7 @@ define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 { ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.z() store i32 %id, i32 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll index 1f18173f40a4..8b80998cab6f 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -18,7 +18,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0 ; ALL-NOT: v0 ; ALL: {{buffer|flat}}_store_dword {{.*}}v0 -define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id, i32 addrspace(1)* %out ret void @@ -33,7 +33,7 @@ define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { ; ALL-NOT: v1 ; ALL: {{buffer|flat}}_store_dword {{.*}}v1 -define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.y() store i32 %id, i32 addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { ; ALL-NOT: v2 ; ALL: {{buffer|flat}}_store_dword {{.*}}v2 -define void @test_workitem_id_z(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.z() store i32 %id, i32 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 112e29ed22a7..0604a49372a2 100644 --- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -4,7 +4,7 @@ declare half @llvm.ceil.f16(half %a) declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) -; GCN-LABEL: {{^}}ceil_f16 +; GCN-LABEL: {{^}}ceil_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_ceil_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] @@ -12,7 +12,7 @@ declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) ; VI: v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @ceil_f16( +define amdgpu_kernel void @ceil_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -22,23 +22,27 @@ entry: ret void } -; GCN-LABEL: {{^}}ceil_v2f16 +; GCN-LABEL: {{^}}ceil_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], 
v[[A_V2_F16]] -; VI: v_ceil_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @ceil_v2f16( +define amdgpu_kernel void @ceil_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll index ba354ed0b124..d836ea36ef63 100644 --- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -13,7 +13,7 @@ declare <2 x half> @llvm.cos.v2f16(<2 x half> %a) ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @cos_f16( +define amdgpu_kernel void @cos_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -25,26 +25,34 @@ entry: ; GCN-LABEL: {{^}}cos_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} -; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] -; VI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] -; GCN: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] -; VI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] -; GCN: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] -; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] + +; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] +; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] + +; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] +; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] + +; GCN-DAG: v_cvt_f16_f32_e32 
v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GCN-NOT: and + +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @cos_v2f16( +define amdgpu_kernel void @cos_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.cos.ll b/test/CodeGen/AMDGPU/llvm.cos.ll index eb7dcbbf2346..bd89502d7b82 100644 --- a/test/CodeGen/AMDGPU/llvm.cos.ll +++ b/test/CodeGen/AMDGPU/llvm.cos.ll @@ -11,7 +11,7 @@ ;SI: v_cos_f32 ;SI-NOT: v_cos_f32 -define void @test(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @test(float addrspace(1)* %out, float %x) #1 { %cos = call float @llvm.cos.f32(float %x) store float %cos, float addrspace(1)* %out ret void @@ -29,7 +29,7 @@ define void @test(float addrspace(1)* %out, float %x) #1 { ;SI: v_cos_f32 ;SI-NOT: v_cos_f32 -define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { +define amdgpu_kernel void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) store <4 x float> %cos, <4 x float> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll index 8b0854c2c2b5..c4a76de5989c 100644 --- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -9,7 +9,7 @@ ; CHECK: buffer_store_dword ; CHECK: s_endpgm -define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { +define amdgpu_kernel void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { entry: tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14 store i32 123, i32 addrspace(1)* %globalptr_arg, align 4 diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll index 7fa56911efdc..5757142b9e95 100644 --- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a) ; VI: v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @exp2_f16( +define amdgpu_kernel void @exp2_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -25,20 +25,24 @@ entry: ; GCN-LABEL: {{^}}exp2_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_exp_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_exp_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI: v_exp_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; 
SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @exp2_v2f16( +define amdgpu_kernel void @exp2_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.exp2.ll b/test/CodeGen/AMDGPU/llvm.exp2.ll index 42698925aae4..387dc3b8566a 100644 --- a/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -1,6 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC ;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn | FileCheck %s --check-prefix=SI --check-prefix=FUNC ;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC ;FUNC-LABEL: {{^}}test: @@ -11,7 +11,7 @@ ;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} ;SI: v_exp_f32 -define void @test(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.exp2.f32(float %in) store float %0, float addrspace(1)* %out @@ -34,7 +34,7 @@ entry: ;SI: v_exp_f32 ;SI: v_exp_f32 -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -68,7 +68,7 @@ entry: ;SI: v_exp_f32 ;SI: v_exp_f32 ;SI: v_exp_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 60dfd734ee73..6a18141d8035 100644 --- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) ; VI: v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @floor_f16( +define amdgpu_kernel void @floor_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -25,20 +25,24 @@ entry: ; GCN-LABEL: {{^}}floor_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI: v_floor_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: 
v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @floor_v2f16( +define amdgpu_kernel void @floor_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 3431267e3943..518fe8baaa7a 100644 --- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -16,7 +16,7 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16( +define amdgpu_kernel void @fma_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -32,7 +32,8 @@ define void @fma_f16( ; GCN-LABEL: {{^}}fma_f16_imm_a ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} + +; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] @@ -41,7 +42,7 @@ define void @fma_f16( ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_a( +define amdgpu_kernel void @fma_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -55,7 +56,7 @@ define void @fma_f16_imm_a( ; GCN-LABEL: {{^}}fma_f16_imm_b ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}} +; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] @@ -64,7 +65,7 @@ define void @fma_f16_imm_a( ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_b( +define amdgpu_kernel void @fma_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -78,7 +79,7 @@ define void @fma_f16_imm_b( ; GCN-LABEL: {{^}}fma_f16_imm_c ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x4200{{$}} +; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] @@ -87,7 +88,7 @@ define void @fma_f16_imm_b( ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_c( +define 
amdgpu_kernel void @fma_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -102,27 +103,35 @@ define void @fma_f16_imm_c( ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] + +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + +; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] -; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + +; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] + +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16( +define amdgpu_kernel void @fma_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -135,29 +144,33 @@ define void @fma_v2f16( ret void } -; GCN-LABEL: {{^}}fma_v2f16_imm_a +; GCN-LABEL: {{^}}fma_v2f16_imm_a: ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} + +; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; GCN-DAG: v_lshrrev_b32_e32 
v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] ; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]] -; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16]], v[[C_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]] + +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_a( +define amdgpu_kernel void @fma_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b, <2 x half> addrspace(1)* %c) { @@ -168,29 +181,39 @@ define void @fma_v2f16_imm_a( ret void } -; GCN-LABEL: {{^}}fma_v2f16_imm_b -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}} +; GCN-LABEL: {{^}}fma_v2f16_imm_b: +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] + +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] + +; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]] -; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] + +; SI-DAG: v_cvt_f32_f16_e32 
v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + +; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]] + +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_b( +define amdgpu_kernel void @fma_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %c) { @@ -201,29 +224,39 @@ define void @fma_v2f16_imm_b( ret void } -; GCN-LABEL: {{^}}fma_v2f16_imm_c -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x4200{{$}} +; GCN-LABEL: {{^}}fma_v2f16_imm_c: +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] + +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] + +; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] -; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] + +; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] + +; GCN-NOT: and +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_c( +define amdgpu_kernel void 
@fma_v2f16_imm_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 3bc85bdc29ef..f30fd1d58204 100644 --- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s -; RUN: llc -march=amdgcn -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -24,7 +24,7 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> ; VI-DENORM: buffer_store_short [[RESULT]] ; GCN: s_endpgm -define void @fmuladd_f16( +define amdgpu_kernel void @fmuladd_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -40,10 +40,9 @@ define void @fmuladd_f16( ; GCN-LABEL: {{^}}fmuladd_f16_imm_a ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] +; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] ; SI: buffer_store_short v[[R_F16]] @@ -55,7 +54,7 @@ define void @fmuladd_f16( ; VI-DENORM: buffer_store_short [[RESULT]] ; GCN: s_endpgm -define void @fmuladd_f16_imm_a( +define amdgpu_kernel void @fmuladd_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -69,10 +68,9 @@ define void @fmuladd_f16_imm_a( ; GCN-LABEL: {{^}}fmuladd_f16_imm_b ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]] +; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] ; SI: buffer_store_short v[[R_F16]] @@ -85,7 +83,7 @@ define void @fmuladd_f16_imm_a( ; GCN: s_endpgm -define void @fmuladd_f16_imm_b( +define 
amdgpu_kernel void @fmuladd_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -100,38 +98,45 @@ define void @fmuladd_f16_imm_b( ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] ; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[C_F32_0]] ; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] - - -; FIXME: and should be unnecessary -; VI-FLUSH: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI-FLUSH: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] -; VI-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]] -; VI-FLUSH: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] - +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]] +; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]] +; VI-FLUSH-NOT: v_and_b32 +; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]] + +; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] ; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] -; VI-DENORM: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[RES0]] -; VI-DENORM: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] +; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] +; VI-DENORM-NOT: v_and_b32 +; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmuladd_v2f16( + +define amdgpu_kernel void @fmuladd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll index 8d1a8973cb4e..773eb55283e4 100644 --- 
a/test/CodeGen/AMDGPU/llvm.log2.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.log2.v2f16(<2 x half> %a) ; VI: v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @log2_f16( +define amdgpu_kernel void @log2_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -24,21 +24,25 @@ entry: ; GCN-LABEL: {{^}}log2_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI: v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @log2_v2f16( +define amdgpu_kernel void @log2_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.log2.ll b/test/CodeGen/AMDGPU/llvm.log2.ll index c75e7850b353..b9d593e43f32 100644 --- a/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/test/CodeGen/AMDGPU/llvm.log2.ll @@ -11,7 +11,7 @@ ;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ;SI: v_log_f32 -define void @test(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.log2.f32(float %in) store float %0, float addrspace(1)* %out @@ -34,7 +34,7 @@ entry: ;SI: v_log_f32 ;SI: v_log_f32 -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -68,7 +68,7 @@ entry: ;SI: v_log_f32 ;SI: v_log_f32 ;SI: v_log_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 8adc01b7b8c7..4c8dff52509a 100644 --- 
a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -4,7 +4,7 @@ declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) -; GCN-LABEL: {{^}}maxnum_f16 +; GCN-LABEL: {{^}}maxnum_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] @@ -14,7 +14,7 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16( +define amdgpu_kernel void @maxnum_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -26,16 +26,15 @@ entry: ret void } -; GCN-LABEL: {{^}}maxnum_f16_imm_a +; GCN-LABEL: {{^}}maxnum_f16_imm_a: ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16_imm_a( +define amdgpu_kernel void @maxnum_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -45,16 +44,15 @@ entry: ret void } -; GCN-LABEL: {{^}}maxnum_f16_imm_b +; GCN-LABEL: {{^}}maxnum_f16_imm_b: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16_imm_b( +define amdgpu_kernel void @maxnum_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -64,27 +62,33 @@ entry: ret void } -; GCN-LABEL: {{^}}maxnum_v2f16 +; GCN-LABEL: {{^}}maxnum_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; 
SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16( +define amdgpu_kernel void @maxnum_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -96,25 +100,24 @@ entry: ret void } -; GCN-LABEL: {{^}}maxnum_v2f16_imm_a +; GCN-LABEL: {{^}}maxnum_v2f16_imm_a: ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] + +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16_imm_a( +define amdgpu_kernel void @maxnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -124,25 +127,24 @@ entry: ret void } -; GCN-LABEL: {{^}}maxnum_v2f16_imm_b +; GCN-LABEL: {{^}}maxnum_v2f16_imm_b: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}} -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], 
v[[R_F16_LO]] +; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] + +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16_imm_b( +define amdgpu_kernel void @maxnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll index 009338d273f5..7b4db55155eb 100644 --- a/test/CodeGen/AMDGPU/llvm.memcpy.ll +++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -80,7 +80,7 @@ declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace ; SI-DAG: ds_write_b8 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind @@ -125,7 +125,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias % ; SI-DAG: ds_write_b16 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind @@ -144,7 +144,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias % ; SI: ds_write2_b32 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind @@ -161,7 +161,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias % ; SI: ds_write2_b64 ; SI-DAG: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind @@ -238,7 +238,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias % ; SI-DAG: buffer_store_byte ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* 
noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind @@ -281,7 +281,7 @@ define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noa ; SI-DAG: buffer_store_short ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind @@ -294,7 +294,7 @@ define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noa ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind @@ -307,7 +307,7 @@ define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noa ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind @@ -320,7 +320,7 @@ define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noa ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind @@ -340,7 +340,7 @@ define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* no ; SI-DAG: s_load_dwordx2 ; SI-DAG: buffer_store_dwordx4 ; SI-DAG: buffer_store_dwordx4 -define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind { +define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind { %str = bitcast [16 x i8] 
addrspace(2)* @hello.align4 to i8 addrspace(2)* call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 4, i1 false) ret void @@ -365,7 +365,7 @@ define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) noun ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI: buffer_store_byte -define void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind { +define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind { %str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)* call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 1, i1 false) ret void diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 4cc1deb2095c..b8221356b664 100644 --- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -4,7 +4,7 @@ declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) -; GCN-LABEL: {{^}}minnum_f16 +; GCN-LABEL: {{^}}minnum_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] @@ -14,7 +14,7 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @minnum_f16( +define amdgpu_kernel void @minnum_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -26,16 +26,15 @@ entry: ret void } -; GCN-LABEL: {{^}}minnum_f16_imm_a +; GCN-LABEL: {{^}}minnum_f16_imm_a: ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @minnum_f16_imm_a( +define amdgpu_kernel void @minnum_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -45,16 +44,15 @@ entry: ret void } -; GCN-LABEL: {{^}}minnum_f16_imm_b +; GCN-LABEL: {{^}}minnum_f16_imm_b: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @minnum_f16_imm_b( +define amdgpu_kernel void @minnum_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -64,27 +62,32 @@ entry: ret void } -; GCN-LABEL: {{^}}minnum_v2f16 +; GCN-LABEL: {{^}}minnum_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] 
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16( +define amdgpu_kernel void @minnum_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -96,25 +99,27 @@ entry: ret void } -; GCN-LABEL: {{^}}minnum_v2f16_imm_a +; GCN-LABEL: {{^}}minnum_v2f16_imm_a: ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}} + ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] +; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + +; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] + +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16_imm_a( +define amdgpu_kernel void @minnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -124,25 +129,24 @@ entry: ret void } 
-; GCN-LABEL: {{^}}minnum_v2f16_imm_b +; GCN-LABEL: {{^}}minnum_v2f16_imm_b: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}} -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] +; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] + +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16_imm_b( +define amdgpu_kernel void @minnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.r600.cube.ll b/test/CodeGen/AMDGPU/llvm.r600.cube.ll new file mode 100644 index 000000000000..b5a0de95acf5 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.r600.cube.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s + +; CHECK-LABEL: {{^}}cube: +; CHECK: CUBE T{{[0-9]}}.X +; CHECK: CUBE T{{[0-9]}}.Y +; CHECK: CUBE T{{[0-9]}}.Z +; CHECK: CUBE * T{{[0-9]}}.W +define amdgpu_ps void @cube() { +main_body: + %tmp = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %tmp1 = extractelement <4 x float> %tmp, i32 3 + %tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %tmp3 = extractelement <4 x float> %tmp2, i32 0 + %tmp4 = fdiv float %tmp3, %tmp1 + %tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %tmp6 = extractelement <4 x float> %tmp5, i32 1 + %tmp7 = fdiv float %tmp6, %tmp1 + %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %tmp9 = extractelement <4 x float> %tmp8, i32 2 + %tmp10 = fdiv float %tmp9, %tmp1 + %tmp11 = insertelement <4 x float> undef, float %tmp4, i32 0 + %tmp12 = insertelement <4 x float> %tmp11, float %tmp7, i32 1 + %tmp13 = insertelement <4 x float> %tmp12, float %tmp10, i32 2 + %tmp14 = insertelement <4 x float> %tmp13, float 1.000000e+00, i32 3 + %tmp15 = call <4 x float> @llvm.r600.cube(<4 x float> %tmp14) + %tmp16 = extractelement <4 x float> %tmp15, i32 0 + %tmp17 = extractelement <4 x float> %tmp15, i32 1 + %tmp18 = extractelement <4 x float> %tmp15, i32 2 + %tmp19 = extractelement <4 x float> %tmp15, i32 3 + %tmp20 = call float @llvm.fabs.f32(float 
%tmp18) + %tmp21 = fdiv float 1.000000e+00, %tmp20 + %tmp22 = fmul float %tmp16, %tmp21 + %tmp23 = fadd float %tmp22, 1.500000e+00 + %tmp24 = fmul float %tmp17, %tmp21 + %tmp25 = fadd float %tmp24, 1.500000e+00 + %tmp26 = insertelement <4 x float> undef, float %tmp25, i32 0 + %tmp27 = insertelement <4 x float> %tmp26, float %tmp23, i32 1 + %tmp28 = insertelement <4 x float> %tmp27, float %tmp19, i32 2 + %tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 3 + %tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> + %tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1) + call void @llvm.r600.store.swizzle(<4 x float> %tmp31, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare <4 x float> @llvm.r600.cube(<4 x float>) #0 + +; Function Attrs: nounwind readnone +declare float @llvm.fabs.f32(float) #0 + +declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) + +; Function Attrs: readnone +declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.r600.dot4.ll b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll index 4db29c58385e..de8a47741c94 100644 --- a/test/CodeGen/AMDGPU/llvm.r600.dot4.ll +++ b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll @@ -2,7 +2,7 @@ declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone -define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { +define amdgpu_kernel void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 %dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone diff --git a/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll index e4e6dd8e1069..93caafbb9524 100644 --- a/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll +++ b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll @@ -2,7 +2,7 @@ ; EG-LABEL: {{^}}test_group_barrier: ; EG: GROUP_BARRIER -define void @test_group_barrier(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_group_barrier(i32 addrspace(1)* %out) #0 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index a5b07e072fa5..82c42601ef1e 100644 --- a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -14,7 +14,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -44,7 +44,7 @@ entry: ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 ; GCN: 
v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -59,7 +59,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xy(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %y = call i32 @llvm.r600.read.local.size.y() #0 @@ -78,7 +78,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xz(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %z = call i32 @llvm.r600.read.local.size.z() #0 @@ -98,7 +98,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_yz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_yz(i32 addrspace(1)* %out) { entry: %y = call i32 @llvm.r600.read.local.size.y() #0 %z = call i32 @llvm.r600.read.local.size.z() #0 @@ -121,7 +121,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xyz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xyz(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %y = call i32 @llvm.r600.read.local.size.y() #0 @@ -138,7 +138,7 @@ entry: ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.x() #0 %shl = shl i32 %size, 16 @@ -153,7 +153,7 @@ entry: ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.y() #0 %shl = shl i32 %size, 16 @@ -168,7 +168,7 @@ entry: ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.z() #0 %shl = shl i32 %size, 16 diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll index 1c6e7950e9b7..90d076d4fb4d 100644 --- a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll +++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll @@ -4,7 +4,7 @@ declare float @llvm.r600.recipsqrt.clamped.f32(float) nounwind readnone ; EG-LABEL: {{^}}rsq_clamped_f32: ; EG: RECIPSQRT_CLAMPED -define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { +define amdgpu_kernel void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { %rsq_clamped = call float 
@llvm.r600.recipsqrt.clamped.f32(float %src) store float %rsq_clamped, float addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll index 1d6bff01e662..d9177b39b8ac 100644 --- a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll +++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll @@ -4,7 +4,7 @@ declare float @llvm.r600.recipsqrt.ieee.f32(float) nounwind readnone ; EG-LABEL: {{^}}recipsqrt.ieee_f32: ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { ; TODO: Really these should be constant folded ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0 ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0 ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/llvm.r600.tex.ll b/test/CodeGen/AMDGPU/llvm.r600.tex.ll index 409037f3e976..98044917e2b0 100644 --- a/test/CodeGen/AMDGPU/llvm.r600.tex.ll +++ b/test/CodeGen/AMDGPU/llvm.r600.tex.ll @@ -17,7 +17,7 @@ ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { bb: %addr = load <4 x float>, <4 x float> addrspace(1)* %in %tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 3657940f36fd..59e81a7acc0b 100644 --- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s 
declare half @llvm.rint.f16(half %a) declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) @@ -9,10 +10,10 @@ declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_rndne_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GFX89: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @rint_f16( +define amdgpu_kernel void @rint_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -25,20 +26,30 @@ entry: ; GCN-LABEL: {{^}}rint_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_and_b32 +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: v_and_b32 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + +; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] +; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @rint_v2f16( +define amdgpu_kernel void @rint_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/test/CodeGen/AMDGPU/llvm.rint.f64.ll index c63fb1727940..30ce8ed83ff1 100644 --- a/test/CodeGen/AMDGPU/llvm.rint.f64.ll +++ b/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -11,7 +11,7 @@ ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 ; SI: s_endpgm -define void @rint_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @rint_f64(double addrspace(1)* %out, double %in) { entry: %0 = call double @llvm.rint.f64(double %in) store double %0, double addrspace(1)* %out @@ -21,7 +21,7 @@ entry: ; FUNC-LABEL: {{^}}rint_v2f64: ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 -define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { entry: %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) store <2 x double> %0, <2 x double> addrspace(1)* %out @@ -33,7 +33,7 @@ entry: ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 -define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void 
@rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { entry: %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) store <4 x double> %0, <4 x double> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/llvm.rint.ll b/test/CodeGen/AMDGPU/llvm.rint.ll index cf7c0e4c6fb6..4056bc39448d 100644 --- a/test/CodeGen/AMDGPU/llvm.rint.ll +++ b/test/CodeGen/AMDGPU/llvm.rint.ll @@ -6,7 +6,7 @@ ; R600: RNDNE ; SI: v_rndne_f32_e32 -define void @rint_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @rint_f32(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.rint.f32(float %in) #0 store float %0, float addrspace(1)* %out @@ -19,7 +19,7 @@ entry: ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 -define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -36,7 +36,7 @@ entry: ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 -define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 store <4 x float> %0, <4 x float> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll index 3ea4551f0ee7..c58b9b4d9e94 100644 --- a/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -2,7 +2,7 @@ ; FUNC-LABEL: {{^}}round_f64: ; SI: s_endpgm -define void @round_f64(double addrspace(1)* %out, double %x) #0 { +define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 { %result = call double @llvm.round.f64(double %x) #1 store double %result, double addrspace(1)* %out ret void @@ -26,7 +26,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 { ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid @@ -38,7 +38,7 @@ define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 ; FUNC-LABEL: {{^}}round_v2f64: ; SI: s_endpgm -define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { +define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 store <2 x double> %result, <2 x double> addrspace(1)* %out ret void @@ -46,7 +46,7 @@ define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { ; FUNC-LABEL: {{^}}round_v4f64: ; SI: s_endpgm -define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { +define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -54,7 +54,7 @@ define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { ; FUNC-LABEL: {{^}}round_v8f64: ; SI: s_endpgm -define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x 
double> %in) #0 { +define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, <8 x double> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll index 7e8f8ff172e8..ffe87977870b 100644 --- a/test/CodeGen/AMDGPU/llvm.round.ll +++ b/test/CodeGen/AMDGPU/llvm.round.ll @@ -1,18 +1,19 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}round_f32: -; SI-DAG: s_load_dword [[SX:s[0-9]+]] -; SI-DAG: s_brev_b32 [[K:s[0-9]+]], -2{{$}} -; SI-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] -; SI-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] -; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] -; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] -; SI: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5 -; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]] -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] -; SI: buffer_store_dword [[RESULT]] +; GCN-DAG: s_load_dword [[SX:s[0-9]+]] +; GCN-DAG: s_brev_b32 [[K:s[0-9]+]], -2{{$}} +; GCN-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] +; GCN-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] +; GCN-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] +; GCN: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] +; GCN: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5 +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]] +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; GCN: buffer_store_dword [[RESULT]] ; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] ; R600-DAG: ADD {{.*}}, @@ -20,7 +21,7 @@ ; R600-DAG: SETGE ; R600-DAG: CNDE ; R600-DAG: ADD -define void @round_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @round_f32(float addrspace(1)* %out, float %x) #0 { %result = call float @llvm.round.f32(float %x) #1 store float %result, float addrspace(1)* %out ret void @@ -32,36 +33,78 @@ define void @round_f32(float addrspace(1)* %out, float %x) #0 { ; compiler doesn't crash. 
; FUNC-LABEL: {{^}}round_v2f32: -; SI: s_endpgm +; GCN: s_endpgm ; R600: CF_END -define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { +define amdgpu_kernel void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}round_v4f32: -; SI: s_endpgm +; GCN: s_endpgm ; R600: CF_END -define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { +define amdgpu_kernel void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 store <4 x float> %result, <4 x float> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}round_v8f32: -; SI: s_endpgm +; GCN: s_endpgm ; R600: CF_END -define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { +define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 store <8 x float> %result, <8 x float> addrspace(1)* %out ret void } +; FUNC-LABEL: {{^}}round_f16: +; GFX89-DAG: s_load_dword [[SX:s[0-9]+]] +; GFX89-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7fff{{$}} +; GFX89-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] +; GFX89-DAG: v_mov_b32_e32 [[BFI_K:v[0-9]+]], 0x3c00 +; GFX89: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], [[BFI_K]], [[VX]] + +; GFX89: v_trunc_f16_e32 [[TRUNC:v[0-9]+]], [[SX]] +; GFX89: v_sub_f16_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] +; GFX89: v_cmp_ge_f16_e64 vcc, |[[SUB]]|, 0.5 +; GFX89: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[COPYSIGN]] +; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; GFX89: buffer_store_short [[RESULT]] +define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 { + %x.arg.trunc = trunc i32 %x.arg to i16 + %x = bitcast i16 %x.arg.trunc to half + %result = call half @llvm.round.f16(half %x) #1 + store half %result, half addrspace(1)* %out + ret void +} + +; Should be scalarized +; FUNC-LABEL: {{^}}round_v2f16: +; GFX89-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7fff{{$}} +; GFX89-DAG: v_mov_b32_e32 [[BFI_K:v[0-9]+]], 0x3c00 +; GFX89: v_bfi_b32 [[COPYSIGN0:v[0-9]+]], [[K]], [[BFI_K]], +; GFX89: v_bfi_b32 [[COPYSIGN1:v[0-9]+]], [[K]], [[BFI_K]], + +; GFX9: v_and_b32_e32 +; GFX9: v_lshl_or_b32 +define amdgpu_kernel void @round_v2f16(<2 x half> addrspace(1)* %out, i32 %in.arg) #0 { + %in = bitcast i32 %in.arg to <2 x half> + %result = call <2 x half> @llvm.round.v2f16(<2 x half> %in) + store <2 x half> %result, <2 x half> addrspace(1)* %out + ret void +} + declare float @llvm.round.f32(float) #1 declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 +declare half @llvm.round.f16(half) #1 +declare <2 x half> @llvm.round.v2f16(<2 x half>) #1 +declare <4 x half> @llvm.round.v4f16(<4 x half>) #1 +declare <8 x half> @llvm.round.v8f16(<8 x half>) #1 + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll index b01932f69b06..eb1f32c981f8 100644 --- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -13,7 +13,7 @@ declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sin_f16( +define 
amdgpu_kernel void @sin_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -26,25 +26,35 @@ entry: ; GCN-LABEL: {{^}}sin_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} -; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] -; VI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] -; GCN: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] -; VI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] -; GCN: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] -; GCN: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] +; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] +; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] + +; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] +; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] +; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] + +; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] +; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] + +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @sin_v2f16( +define amdgpu_kernel void @sin_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.sin.ll b/test/CodeGen/AMDGPU/llvm.sin.ll index 04754396a0f7..2a17303267ba 100644 --- a/test/CodeGen/AMDGPU/llvm.sin.ll +++ b/test/CodeGen/AMDGPU/llvm.sin.ll @@ -12,7 +12,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @sin_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @sin_f32(float addrspace(1)* %out, float %x) #1 { %sin = call float @llvm.sin.f32(float %x) store float %sin, float addrspace(1)* %out ret void @@ -24,7 +24,7 @@ define void @sin_f32(float addrspace(1)* %out, float %x) #1 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: 
v_sin_f32 -define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 3.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -38,7 +38,7 @@ define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 3.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -51,7 +51,7 @@ define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -65,7 +65,7 @@ define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -78,7 +78,7 @@ define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -91,7 +91,7 @@ define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -110,7 +110,7 @@ define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 { ; SI: v_sin_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { +define amdgpu_kernel void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) store <4 x float> %sin, <4 x float> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index 69125b0bcfdc..46ee6526aca2 100644 --- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) ; VI: v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sqrt_f16( +define amdgpu_kernel void @sqrt_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -25,20 +25,24 @@ entry: ; GCN-LABEL: {{^}}sqrt_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 
v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_sqrt_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_sqrt_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI: v_sqrt_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_and_b32 +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: v_and_b32 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @sqrt_v2f16( +define amdgpu_kernel void @sqrt_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index 9f84b432209d..dc7182aa0d89 100644 --- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) ; VI: v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @trunc_f16( +define amdgpu_kernel void @trunc_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -25,20 +25,24 @@ entry: ; GCN-LABEL: {{^}}trunc_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI: v_trunc_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_and_b32 +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: v_and_b32 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @trunc_v2f16( +define amdgpu_kernel void @trunc_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/load-constant-f64.ll b/test/CodeGen/AMDGPU/load-constant-f64.ll index 1b42a9e96e01..0050d1a4f874 100644 --- a/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -6,7 +6,7 @@ ; GCN: s_load_dwordx2 
s[{{[0-9]+:[0-9]+}}] ; GCN-NOHSA: buffer_store_dwordx2 ; GCN-HSA: flat_store_dwordx2 -define void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { %ld = load double, double addrspace(2)* %in store double %ld, double addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/load-constant-i1.ll b/test/CodeGen/AMDGPU/load-constant-i1.ll index 104af10036c1..c8abe5c77ee5 100644 --- a/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -9,56 +9,56 @@ ; EG: VTX_READ_8 ; EG: AND_INT -define void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %load = load i1, i1 addrspace(2)* %in store i1 %load, i1 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v2i1: -define void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in store <2 x i1> %load, <2 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v3i1: -define void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in store <3 x i1> %load, <3 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v4i1: -define void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in store <4 x i1> %load, <4 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v8i1: -define void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in store <8 x i1> %load, <8 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v16i1: -define void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in store <16 x i1> %load, <16 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v32i1: -define void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in store <32 x i1> %load, <32 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v64i1: -define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in store <64 x i1> %load, <64 x 
i1> addrspace(1)* %out ret void @@ -67,7 +67,7 @@ define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspa ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32: ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword -define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -81,7 +81,7 @@ define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2) ; EG: VTX_READ_8 ; EG: BFE_INT -define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -89,7 +89,7 @@ define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2) } ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32: -define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -97,7 +97,7 @@ define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32: -define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -105,7 +105,7 @@ define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32: -define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -113,7 +113,7 @@ define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32: -define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -121,7 +121,7 @@ define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32: -define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> 
addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x } ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32: -define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x } ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32: -define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -145,7 +145,7 @@ define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32: -define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -153,7 +153,7 @@ define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32: -define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -161,7 +161,7 @@ define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32: -define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -169,7 +169,7 @@ define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32: -define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -177,7 +177,7 @@ define void @constant_zextload_v16i1_to_v16i32(<16 x i32> 
addrspace(1)* %out, <1 } ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32: -define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -185,7 +185,7 @@ define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <1 } ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32: -define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -193,7 +193,7 @@ define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32: -define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -201,7 +201,7 @@ define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32: -define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -209,7 +209,7 @@ define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <6 } ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32: -define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -221,7 +221,7 @@ define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <6 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]] ; GCN: buffer_store_dwordx2 -define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -233,7 +233,7 @@ define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2) ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: buffer_store_dwordx2 -define void 
@constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -241,7 +241,7 @@ define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2) } ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64: -define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64: -define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -257,7 +257,7 @@ define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64: -define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -265,7 +265,7 @@ define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64: -define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -273,7 +273,7 @@ define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64: -define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -281,7 +281,7 @@ define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x } ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64: -define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> 
%ext, <3 x i64> addrspace(1)* %out @@ -289,7 +289,7 @@ define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x } ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64: -define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -297,7 +297,7 @@ define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64: -define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -305,7 +305,7 @@ define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64: -define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -313,7 +313,7 @@ define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64: -define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -321,7 +321,7 @@ define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64: -define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -329,7 +329,7 @@ define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <1 } ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64: -define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -337,7 +337,7 @@ define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <1 } ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64: -define void 
@constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -345,7 +345,7 @@ define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64: -define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -353,7 +353,7 @@ define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64: -define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -361,7 +361,7 @@ define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <6 } ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64: -define void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll index f7be1291040f..5dd2efdf6382 100644 --- a/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8,7 +8,7 @@ ; GCN-HSA: flat_load_ushort ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) { entry: %ld = load i16, i16 addrspace(2)* %in store i16 %ld, i16 addrspace(1)* %out @@ -19,7 +19,7 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in store <2 x i16> %ld, <2 x i16> addrspace(1)* %out @@ -31,7 +31,7 @@ entry: ; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 -define void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in store <3 x i16> %ld, <3 x i16> addrspace(1)* %out @@ -42,7 +42,7 
@@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in store <4 x i16> %ld, <4 x i16> addrspace(1)* %out @@ -53,7 +53,7 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in store <8 x i16> %ld, <8 x i16> addrspace(1)* %out @@ -65,7 +65,7 @@ entry: ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in store <16 x i16> %ld, <16 x i16> addrspace(1)* %out @@ -80,7 +80,7 @@ entry: ; GCN-HSA: flat_store_dword ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1 -define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -97,7 +97,7 @@ define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace( ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 16 -define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -109,7 +109,7 @@ define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace( ; GCN-HSA: flat_load_ushort ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1 -define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -123,7 +123,7 @@ define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 16 -define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -140,7 +140,7 @@ define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, 
[[DST]], literal ; EG: 16 ; EG: 16 -define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -160,7 +160,7 @@ define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -183,7 +183,7 @@ define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -204,7 +204,7 @@ entry: ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -229,7 +229,7 @@ entry: ; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -254,7 +254,7 @@ define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -288,7 +288,7 @@ define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; EG-DAG: 65535 ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -322,7 +322,7 @@ define void 
@constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -337,7 +337,7 @@ define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; v16i16 is naturally 32 byte aligned ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1 -define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -352,7 +352,7 @@ define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, < ; v16i16 is naturally 32 byte aligned ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1 -define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -369,7 +369,7 @@ define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1 -define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -385,7 +385,7 @@ define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1 -define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -404,7 +404,7 @@ define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, 
{{T[0-9]+\.[XYZW]}}, 112, #1 -define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(2)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -421,7 +421,7 @@ define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1 -define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(2)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -438,7 +438,7 @@ define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, < ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -464,7 +464,7 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace( ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 -define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -475,7 +475,7 @@ define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace( ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -488,7 +488,7 @@ define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 -define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -498,7 +498,7 @@ define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i16_to_v2i64(<2 x 
i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -508,7 +508,7 @@ define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -518,7 +518,7 @@ define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -528,7 +528,7 @@ define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -538,7 +538,7 @@ define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -548,7 +548,7 @@ define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -559,7 +559,7 @@ define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void 
@constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -570,7 +570,7 @@ define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -583,7 +583,7 @@ define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -596,7 +596,7 @@ define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -606,7 +606,7 @@ define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, < ; These trigger undefined register machine verifier errors ; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64: -; define void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -614,7 +614,7 @@ define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, < ; } ; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64: -; define void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* 
%out diff --git a/test/CodeGen/AMDGPU/load-constant-i32.ll b/test/CodeGen/AMDGPU/load-constant-i32.ll index d1ff1c706c40..7370d45ca6b9 100644 --- a/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -7,7 +7,7 @@ ; GCN: s_load_dword s{{[0-9]+}} ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { entry: %ld = load i32, i32 addrspace(2)* %in store i32 %ld, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 -define void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in store <2 x i32> %ld, <2 x i32> addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out @@ -40,7 +40,7 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out @@ -52,7 +52,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out @@ -66,7 +66,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out @@ -81,7 +81,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG: CF_END ; EG: VTX_READ_32 -define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -98,7 +98,7 @@ define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace( ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 -define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -108,7 +108,7 @@ define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace( ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64: ; GCN: s_load_dword ; GCN: store_dwordx2 -define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -119,7 +119,7 @@ define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31 ; GCN: store_dwordx2 -define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -129,7 +129,7 @@ define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64: ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: store_dwordx4 -define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -143,7 +143,7 @@ define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; GCN-DAG: s_ashr_i32 ; GCN: store_dwordx4 -define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -155,7 +155,7 @@ define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -172,7 +172,7 @@ define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { %ld = load <4 x i32>, <4 x i32> 
addrspace(2)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -191,7 +191,7 @@ define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-SA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -219,7 +219,7 @@ define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -240,7 +240,7 @@ define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; GCN: store_dwordx4 ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -267,7 +267,7 @@ define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, < ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -319,7 +319,7 @@ define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, < ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -370,7 +370,7 @@ define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, < ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out 
diff --git a/test/CodeGen/AMDGPU/load-constant-i64.ll b/test/CodeGen/AMDGPU/load-constant-i64.ll index 0d071a10b49a..14e50ea4c3ca 100644 --- a/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}constant_load_i64: ; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; EG: VTX_READ_64 -define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 { %ld = load i64, i64 addrspace(2)* %in store i64 %ld, i64 addrspace(1)* %out ret void @@ -17,7 +17,7 @@ define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in store <2 x i64> %ld, <2 x i64> addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; EG-DAG: VTX_READ_128 ; EG-DAG: VTX_READ_128 -define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in store <3 x i64> %ld, <3 x i64> addrspace(1)* %out @@ -41,7 +41,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in store <4 x i64> %ld, <4 x i64> addrspace(1)* %out @@ -55,7 +55,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in store <8 x i64> %ld, <8 x i64> addrspace(1)* %out @@ -74,7 +74,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in store <16 x i64> %ld, <16 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll index 9fdc4ebfd854..6e56b9f9b6d6 100644 --- a/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -10,7 +10,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; TODO: NOT AND -define void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { entry: %ld = load i8, i8 addrspace(2)* %in store i8 %ld, i8 addrspace(1)* %out @@ -22,7 +22,7 @@ entry: ; GCN-HSA: flat_load_ushort v ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* 
%in) #0 { entry: %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in store <2 x i8> %ld, <2 x i8> addrspace(1)* %out @@ -33,7 +33,7 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in store <3 x i8> %ld, <3 x i8> addrspace(1)* %out @@ -44,7 +44,7 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in store <4 x i8> %ld, <4 x i8> addrspace(1)* %out @@ -55,7 +55,7 @@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in store <8 x i8> %ld, <8 x i8> addrspace(1)* %out @@ -66,7 +66,7 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(2)* %in store <16 x i8> %ld, <16 x i8> addrspace(1)* %out @@ -78,7 +78,7 @@ entry: ; GCN-HSA: flat_load_ubyte ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -92,7 +92,7 @@ define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2) ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %ld = load i8, i8 addrspace(2)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(1)* %out @@ -102,7 +102,7 @@ define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2) ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -114,7 +114,7 @@ define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @constant_sextload_v1i8_to_v1i32(<1 x 
i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; TODO: This should use DST, but for some there are redundant MOVs ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG: 8 -define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -150,7 +150,7 @@ define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -170,7 +170,7 @@ define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -193,7 +193,7 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -214,7 +214,7 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -236,7 +236,7 @@ define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -264,7 +264,7 @@ define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v8i8_to_v8i32(<8 x i32> 
addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -294,7 +294,7 @@ define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -335,7 +335,7 @@ define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -378,7 +378,7 @@ define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <1 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -450,7 +450,7 @@ define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <1 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -526,7 +526,7 @@ define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <3 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -539,7 +539,7 @@ define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <3 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(2)* %in %ext = zext <64 x i8> %load to <64 x i32> store 
<64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -552,7 +552,7 @@ define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <6 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(2)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -570,7 +570,7 @@ define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <6 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -589,7 +589,7 @@ define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2) ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? ; EG: 31 -define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -600,7 +600,7 @@ define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2) ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -613,7 +613,7 @@ define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? 
; EG: 31 -define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -623,7 +623,7 @@ define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -633,7 +633,7 @@ define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -643,7 +643,7 @@ define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -653,7 +653,7 @@ define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -663,7 +663,7 @@ define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -673,7 +673,7 @@ define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* 
%out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -683,7 +683,7 @@ define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -693,7 +693,7 @@ define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <1 ; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -704,7 +704,7 @@ define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -715,7 +715,7 @@ define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -723,7 +723,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3 } ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64: -; define void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -731,7 +731,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3 ; } ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64: -; define void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) 
#0 { +; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -744,7 +744,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3 ; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]], ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] -define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -759,7 +759,7 @@ define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2) ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -767,7 +767,7 @@ define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2) } ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16: -define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -778,7 +778,7 @@ define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -788,7 +788,7 @@ define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -800,7 +800,7 @@ define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x 
i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -810,7 +810,7 @@ define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -824,7 +824,7 @@ define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -834,7 +834,7 @@ define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -853,7 +853,7 @@ define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -863,7 +863,7 @@ define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -889,7 +889,7 @@ define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> 
addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -900,7 +900,7 @@ define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -943,7 +943,7 @@ define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -951,7 +951,7 @@ define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3 } ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16: -; define void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out @@ -959,7 +959,7 @@ define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3 ; } ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16: -; define void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll index 805c0a7a39c7..bd6fea587b42 100644 --- a/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/test/CodeGen/AMDGPU/load-global-f32.ll @@ -10,7 +10,7 @@ ; GCN-HSA: flat_load_dword ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %tmp0 = load float, float addrspace(1)* %in store float %tmp0, float addrspace(1)* %out @@ -22,7 +22,7 @@ entry: ; GCN-HSA: flat_load_dwordx2 ; R600: VTX_READ_64 -define void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2f32(<2 x float> 
addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in store <2 x float> %tmp0, <2 x float> addrspace(1)* %out @@ -34,7 +34,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; R600: VTX_READ_128 -define void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in store <3 x float> %tmp0, <3 x float> addrspace(1)* %out @@ -46,7 +46,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; R600: VTX_READ_128 -define void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in store <4 x float> %tmp0, <4 x float> addrspace(1)* %out @@ -61,7 +61,7 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 -define void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in store <8 x float> %tmp0, <8 x float> addrspace(1)* %out @@ -83,7 +83,7 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 ; R600: VTX_READ_128 -define void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in store <16 x float> %tmp0, <16 x float> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll index dc1a9432283e..5b772e1fe5ee 100644 --- a/test/CodeGen/AMDGPU/load-global-f64.ll +++ b/test/CodeGen/AMDGPU/load-global-f64.ll @@ -8,7 +8,7 @@ ; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]] -define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %ld = load double, double addrspace(1)* %in store double %ld, double addrspace(1)* %out ret void @@ -17,7 +17,7 @@ define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in ; FUNC-LABEL: {{^}}global_load_v2f64: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { entry: %ld = load <2 x double>, <2 x double> addrspace(1)* %in store <2 x double> %ld, <2 x double> addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(1)* %in store <3 x double> %ld, <3 x double> addrspace(1)* %out @@ -42,7 +42,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void 
@global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { entry: %ld = load <4 x double>, <4 x double> addrspace(1)* %in store <4 x double> %ld, <4 x double> addrspace(1)* %out @@ -59,7 +59,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { entry: %ld = load <8 x double>, <8 x double> addrspace(1)* %in store <8 x double> %ld, <8 x double> addrspace(1)* %out @@ -84,7 +84,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { entry: %ld = load <16 x double>, <16 x double> addrspace(1)* %in store <16 x double> %ld, <16 x double> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-global-i1.ll b/test/CodeGen/AMDGPU/load-global-i1.ll index e2e90cac8cc1..cb3536a0c128 100644 --- a/test/CodeGen/AMDGPU/load-global-i1.ll +++ b/test/CodeGen/AMDGPU/load-global-i1.ll @@ -9,56 +9,56 @@ ; EG: VTX_READ_8 ; EG: AND_INT -define void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %load = load i1, i1 addrspace(1)* %in store i1 %load, i1 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v2i1: -define void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in store <2 x i1> %load, <2 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v3i1: -define void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in store <3 x i1> %load, <3 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v4i1: -define void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in store <4 x i1> %load, <4 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v8i1: -define void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in store <8 x i1> %load, <8 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v16i1: -define void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in store <16 x i1> %load, <16 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v32i1: 
-define void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in store <32 x i1> %load, <32 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v64i1: -define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in store <64 x i1> %load, <64 x i1> addrspace(1)* %out ret void @@ -67,7 +67,7 @@ define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace ; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword -define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -81,7 +81,7 @@ define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* ; EG: VTX_READ_8 ; EG: BFE_INT -define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -89,7 +89,7 @@ define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32: -define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -97,7 +97,7 @@ define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32: -define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -105,7 +105,7 @@ define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32: -define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -113,7 +113,7 @@ define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32: -define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x 
i1>, <2 x i1> addrspace(1)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -121,7 +121,7 @@ define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32: -define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32: -define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32: -define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -145,7 +145,7 @@ define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32: -define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -153,7 +153,7 @@ define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32: -define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -161,7 +161,7 @@ define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32: -define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -169,7 +169,7 @@ define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32: -define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -177,7 +177,7 @@ define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32: -define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -185,7 +185,7 @@ define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32: -define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -193,7 +193,7 @@ define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32: -define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -201,7 +201,7 @@ define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32: -define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -209,7 +209,7 @@ define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 } ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32: -define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -221,7 +221,7 @@ define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}} ; GCN: buffer_store_dwordx2 -define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -233,7 +233,7 @@ define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* ; 
GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: buffer_store_dwordx2 -define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -241,7 +241,7 @@ define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64: -define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64: -define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -257,7 +257,7 @@ define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64: -define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -265,7 +265,7 @@ define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64: -define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -273,7 +273,7 @@ define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64: -define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -281,7 +281,7 @@ define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64: -define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* 
%out @@ -289,7 +289,7 @@ define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64: -define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -297,7 +297,7 @@ define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64: -define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -305,7 +305,7 @@ define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64: -define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -313,7 +313,7 @@ define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64: -define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -321,7 +321,7 @@ define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64: -define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -329,7 +329,7 @@ define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64: -define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -337,7 +337,7 @@ define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64: -define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) 
#0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -345,7 +345,7 @@ define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64: -define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -353,7 +353,7 @@ define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64: -define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -361,7 +361,7 @@ define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 } ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64: -define void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll index 88d6b7b99d30..dcdd1a947cd4 100644 --- a/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/test/CodeGen/AMDGPU/load-global-i16.ll @@ -11,7 +11,7 @@ ; GCN-HSA: flat_load_ushort ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { entry: %ld = load i16, i16 addrspace(1)* %in store i16 %ld, i16 addrspace(1)* %out @@ -23,7 +23,7 @@ entry: ; GCN-HSA: flat_load_dword v ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in store <2 x i16> %ld, <2 x i16> addrspace(1)* %out @@ -36,7 +36,7 @@ entry: ; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 -define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in store <3 x i16> %ld, <3 x i16> addrspace(1)* %out @@ -48,7 +48,7 @@ entry: ; GCN-HSA: flat_load_dwordx2 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %ld = load <4 x i16>, <4 
x i16> addrspace(1)* %in store <4 x i16> %ld, <4 x i16> addrspace(1)* %out @@ -60,7 +60,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in store <8 x i16> %ld, <8 x i16> addrspace(1)* %out @@ -76,7 +76,7 @@ entry: ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in store <16 x i16> %ld, <16 x i16> addrspace(1)* %out @@ -91,7 +91,7 @@ entry: ; GCN-HSA: flat_store_dword ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -108,7 +108,7 @@ define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EGCM: 16 -define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -120,7 +120,7 @@ define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; GCN-HSA: flat_load_ushort ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -134,7 +134,7 @@ define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EGCM: 16 -define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -148,7 +148,7 @@ define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal ; EGCM: 16 -define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> 
addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -168,7 +168,7 @@ define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -190,7 +190,7 @@ define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; EGCM: 16 ; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal ; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal -define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -214,7 +214,7 @@ entry: ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -237,7 +237,7 @@ entry: ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal ; EGCM-DAG: 16 -define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -262,7 +262,7 @@ define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -296,7 +296,7 @@ define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -330,7 +330,7 @@ define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> 
addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -346,7 +346,7 @@ define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -357,7 +357,7 @@ define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -379,7 +379,7 @@ define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -401,7 +401,7 @@ define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -435,7 +435,7 @@ define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 -define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> 
addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -452,7 +452,7 @@ define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 -define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -469,7 +469,7 @@ define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM: MOV {{.*}}, 0.0 -define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -495,7 +495,7 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EGCM: 31 -define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -506,7 +506,7 @@ define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM: MOV {{.*}}, 0.0 -define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -519,7 +519,7 @@ define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EGCM: 31 -define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -527,7 +527,7 @@ define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i } ; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64: -define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -537,7 +537,7 @@ define void 
@global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64: ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -547,7 +547,7 @@ define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64: ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -557,7 +557,7 @@ define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64: ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -567,7 +567,7 @@ define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64: ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -577,7 +577,7 @@ define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64: ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -588,7 +588,7 @@ define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -599,7 +599,7 @@ define void 
@global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -612,7 +612,7 @@ define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -625,7 +625,7 @@ define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -633,7 +633,7 @@ define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; ; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64: -; define void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -641,7 +641,7 @@ define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; } ; ; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64: -; define void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll index e3335347a63f..5df32c1e3120 100644 --- a/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/test/CodeGen/AMDGPU/load-global-i32.ll @@ -9,7 +9,7 @@ ; GCN-HSA: flat_load_dword ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 
addrspace(1)* %in) #0 { entry: %ld = load i32, i32 addrspace(1)* %in store i32 %ld, i32 addrspace(1)* %out @@ -21,7 +21,7 @@ entry: ; GCN-HSA: flat_load_dwordx2 ; EG: VTX_READ_64 -define void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in store <2 x i32> %ld, <2 x i32> addrspace(1)* %out @@ -33,7 +33,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out @@ -45,7 +45,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out @@ -60,7 +60,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out @@ -82,7 +82,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out @@ -98,7 +98,7 @@ entry: ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -117,7 +117,7 @@ define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1) ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 -define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -130,7 +130,7 @@ define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1) ; GCN-HSA: flat_load_dword ; GCN-HSA: flat_store_dwordx2 -define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -143,7 +143,7 @@ define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -156,7 +156,7 @@ define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; GCN-HSA: flat_load_dwordx2 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -172,7 +172,7 @@ define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -187,7 +187,7 @@ define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -208,7 +208,7 @@ define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %ld 
= load <4 x i32>, <4 x i32> addrspace(1)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -231,7 +231,7 @@ define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-SA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -263,7 +263,7 @@ define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -309,7 +309,7 @@ define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; GCN-DAG: v_ashrrev_i32 ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -344,7 +344,7 @@ define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -444,7 +444,7 @@ define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -511,7 +511,7 @@ define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, 
<32 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll index dd4ce2c10ebd..de16b6c8997e 100644 --- a/test/CodeGen/AMDGPU/load-global-i64.ll +++ b/test/CodeGen/AMDGPU/load-global-i64.ll @@ -13,7 +13,7 @@ ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]] ; EG: VTX_READ_64 -define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %ld = load i64, i64 addrspace(1)* %in store i64 %ld, i64 addrspace(1)* %out ret void @@ -24,7 +24,7 @@ define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in store <2 x i64> %ld, <2 x i64> addrspace(1)* %out @@ -40,7 +40,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in store <3 x i64> %ld, <3 x i64> addrspace(1)* %out @@ -56,7 +56,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(1)* %in store <4 x i64> %ld, <4 x i64> addrspace(1)* %out @@ -78,7 +78,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(1)* %in store <8 x i64> %ld, <8 x i64> addrspace(1)* %out @@ -112,7 +112,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(1)* %in store <16 x i64> %ld, <16 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll index c880700f347b..71adf090532f 100644 --- a/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/test/CodeGen/AMDGPU/load-global-i8.ll @@ -11,7 +11,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; TODO: NOT AND -define void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { entry: %ld = load i8, i8 addrspace(1)* %in store i8 %ld, i8 addrspace(1)* %out @@ -23,7 +23,7 @@ entry: ; GCN-HSA: flat_load_ushort v ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { entry: %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in store 
<2 x i8> %ld, <2 x i8> addrspace(1)* %out @@ -35,7 +35,7 @@ entry: ; GCN-HSA: flat_load_dword v ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in store <3 x i8> %ld, <3 x i8> addrspace(1)* %out @@ -47,7 +47,7 @@ entry: ; GCN-HSA: flat_load_dword v ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in store <4 x i8> %ld, <4 x i8> addrspace(1)* %out @@ -59,7 +59,7 @@ entry: ; GCN-HSA: flat_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(1)* %in store <8 x i8> %ld, <8 x i8> addrspace(1)* %out @@ -72,7 +72,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(1)* %in store <16 x i8> %ld, <16 x i8> addrspace(1)* %out @@ -84,7 +84,7 @@ entry: ; GCN-HSA: flat_load_ubyte ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -98,7 +98,7 @@ define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %ld = load i8, i8 addrspace(1)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(1)* %out @@ -108,7 +108,7 @@ define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -120,7 +120,7 @@ define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8 ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel 
void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -135,7 +135,7 @@ define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8 ; TODO: These should use DST, but for some there are redundant MOVs ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal ; EG-DAG: 8 -define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -152,7 +152,7 @@ define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -174,7 +174,7 @@ define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -207,7 +207,7 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -227,7 +227,7 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -248,7 +248,7 @@ define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -273,7 +273,7 @@ define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -300,7 +300,7 @@ define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -341,7 +341,7 @@ define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -384,7 +384,7 @@ define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -456,7 +456,7 @@ define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -532,7 +532,7 @@ define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -545,7 +545,7 @@ define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(1)* %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -558,7 +558,7 @@ define void 
@global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(1)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -576,7 +576,7 @@ define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -595,7 +595,7 @@ define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? ; EG: 31 -define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -606,7 +606,7 @@ define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -619,7 +619,7 @@ define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? 
; EG: 31 -define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -629,7 +629,7 @@ define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -639,7 +639,7 @@ define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8 ; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -649,7 +649,7 @@ define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -659,7 +659,7 @@ define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8 ; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -669,7 +669,7 @@ define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -679,7 +679,7 @@ define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8 ; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* 
%in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -689,7 +689,7 @@ define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -699,7 +699,7 @@ define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -710,7 +710,7 @@ define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -721,7 +721,7 @@ define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -729,7 +729,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64: -; define void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -737,7 +737,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; } ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64: -; define void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void 
@global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -752,7 +752,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -768,7 +768,7 @@ define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -778,7 +778,7 @@ define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -789,7 +789,7 @@ define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -799,7 +799,7 @@ define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -811,7 +811,7 @@ define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 
x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -821,7 +821,7 @@ define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -835,7 +835,7 @@ define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -845,7 +845,7 @@ define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -863,7 +863,7 @@ define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -873,7 +873,7 @@ define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -899,7 +899,7 @@ define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v16i8_to_v16i16(<16 x 
i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -910,7 +910,7 @@ define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -953,7 +953,7 @@ define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -961,7 +961,7 @@ define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 } ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16: -; define void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out @@ -969,7 +969,7 @@ define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 ; } ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16: -; define void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/load-input-fold.ll b/test/CodeGen/AMDGPU/load-input-fold.ll index b1899a45bf56..0724e09d7ad0 100644 --- a/test/CodeGen/AMDGPU/load-input-fold.ll +++ b/test/CodeGen/AMDGPU/load-input-fold.ll @@ -97,15 +97,6 @@ main_body: ; Function Attrs: readnone declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1 -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1 - ; Function Attrs: nounwind readonly declare float @llvm.pow.f32(float, float) #3 diff --git a/test/CodeGen/AMDGPU/load-local-f32.ll b/test/CodeGen/AMDGPU/load-local-f32.ll index 77b5e3cf3aed..09d7145424de 100644 --- a/test/CodeGen/AMDGPU/load-local-f32.ll +++ 
b/test/CodeGen/AMDGPU/load-local-f32.ll @@ -7,7 +7,7 @@ ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 { entry: %tmp0 = load float, float addrspace(3)* %in store float %tmp0, float addrspace(1)* %out @@ -20,7 +20,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in store <2 x float> %tmp0, <2 x float> addrspace(1)* %out @@ -38,7 +38,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in store <3 x float> %tmp0, <3 x float> addrspace(3)* %out @@ -52,7 +52,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in store <4 x float> %tmp0, <4 x float> addrspace(3)* %out @@ -71,7 +71,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in store <8 x float> %tmp0, <8 x float> addrspace(3)* %out @@ -100,7 +100,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in store <16 x float> %tmp0, <16 x float> addrspace(3)* %out diff --git a/test/CodeGen/AMDGPU/load-local-f64.ll b/test/CodeGen/AMDGPU/load-local-f64.ll index 27d39b7e9d7d..9ad6c087bf2e 100644 --- a/test/CodeGen/AMDGPU/load-local-f64.ll +++ b/test/CodeGen/AMDGPU/load-local-f64.ll @@ -9,7 +9,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 { %ld = load double, double addrspace(3)* %in store double %ld, double addrspace(3)* %out ret void @@ -22,7 +22,7 @@ define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 { entry: %ld = load <2 x double>, <2 x double> addrspace(3)* %in store <2 x double> %ld, <2 x double> addrspace(3)* %out @@ -39,7 +39,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: 
LDS_READ_RET -define void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(3)* %in store <3 x double> %ld, <3 x double> addrspace(3)* %out @@ -59,7 +59,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 { entry: %ld = load <4 x double>, <4 x double> addrspace(3)* %in store <4 x double> %ld, <4 x double> addrspace(3)* %out @@ -88,7 +88,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 { entry: %ld = load <8 x double>, <8 x double> addrspace(3)* %in store <8 x double> %ld, <8 x double> addrspace(3)* %out @@ -144,7 +144,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 { entry: %ld = load <16 x double>, <16 x double> addrspace(3)* %in store <16 x double> %ld, <16 x double> addrspace(3)* %out diff --git a/test/CodeGen/AMDGPU/load-local-i1.ll b/test/CodeGen/AMDGPU/load-local-i1.ll index 2eed9917b5e5..e8f134b1fb2e 100644 --- a/test/CodeGen/AMDGPU/load-local-i1.ll +++ b/test/CodeGen/AMDGPU/load-local-i1.ll @@ -10,56 +10,56 @@ ; EG: LDS_UBYTE_READ_RET ; EG: AND_INT ; EG: LDS_BYTE_WRITE -define void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %load = load i1, i1 addrspace(3)* %in store i1 %load, i1 addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v2i1: -define void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in store <2 x i1> %load, <2 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v3i1: -define void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in store <3 x i1> %load, <3 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v4i1: -define void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in store <4 x i1> %load, <4 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v8i1: -define void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in store <8 x i1> %load, <8 x i1> addrspace(3)* %out ret void } ; 
FUNC-LABEL: {{^}}local_load_v16i1: -define void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in store <16 x i1> %load, <16 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v32i1: -define void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in store <32 x i1> %load, <32 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v64i1: -define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in store <64 x i1> %load, <64 x i1> addrspace(3)* %out ret void @@ -68,7 +68,7 @@ define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace( ; FUNC-LABEL: {{^}}local_zextload_i1_to_i32: ; GCN: ds_read_u8 ; GCN: ds_write_b32 -define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -82,7 +82,7 @@ define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* % ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -90,7 +90,7 @@ define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* % } ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32: -define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -98,7 +98,7 @@ define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32: -define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -106,7 +106,7 @@ define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32: -define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -114,7 +114,7 @@ define void 
@local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32: -define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -122,7 +122,7 @@ define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32: -define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(3)* %out @@ -130,7 +130,7 @@ define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32: -define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(3)* %out @@ -138,7 +138,7 @@ define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32: -define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -146,7 +146,7 @@ define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32: -define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -154,7 +154,7 @@ define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32: -define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -162,7 +162,7 @@ define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32: -define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = sext <8 x i1> %load to <8 x i32> 
store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -170,7 +170,7 @@ define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32: -define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -178,7 +178,7 @@ define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x } ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32: -define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -186,7 +186,7 @@ define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x } ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32: -define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -194,7 +194,7 @@ define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x } ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32: -define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -202,7 +202,7 @@ define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x } ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32: -define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -210,7 +210,7 @@ define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x } ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32: -define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -221,7 +221,7 @@ define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x ; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN: ds_write_b64 -define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 
{ +define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -233,7 +233,7 @@ define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* % ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: ds_write_b64 -define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -241,7 +241,7 @@ define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* % } ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64: -define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -249,7 +249,7 @@ define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64: -define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -257,7 +257,7 @@ define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64: -define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -265,7 +265,7 @@ define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64: -define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -273,7 +273,7 @@ define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64: -define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(3)* %out @@ -281,7 +281,7 @@ define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64: -define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> 
addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(3)* %out @@ -289,7 +289,7 @@ define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64: -define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -297,7 +297,7 @@ define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64: -define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -305,7 +305,7 @@ define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64: -define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -313,7 +313,7 @@ define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64: -define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -321,7 +321,7 @@ define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64: -define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -329,7 +329,7 @@ define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x } ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64: -define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -337,7 +337,7 @@ define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x } ; FUNC-LABEL: 
{{^}}local_zextload_v32i1_to_v32i64: -define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -345,7 +345,7 @@ define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x } ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64: -define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -353,7 +353,7 @@ define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x } ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64: -define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -361,7 +361,7 @@ define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x } ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64: -define void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(3)* %out diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll index d4e86de66aff..bbbb34e8d333 100644 --- a/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/test/CodeGen/AMDGPU/load-local-i16.ll @@ -10,7 +10,7 @@ ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) { +define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) { entry: %ld = load i16, i16 addrspace(3)* %in store i16 %ld, i16 addrspace(3)* %out @@ -25,7 +25,7 @@ entry: ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in store <2 x i16> %ld, <2 x i16> addrspace(3)* %out @@ -39,7 +39,7 @@ entry: ; EG-DAG: LDS_USHORT_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in store <3 x i16> %ld, <3 x i16> addrspace(3)* %out @@ -51,7 +51,7 @@ 
entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in store <4 x i16> %ld, <4 x i16> addrspace(3)* %out @@ -65,7 +65,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in store <8 x i16> %ld, <8 x i16> addrspace(3)* %out @@ -86,7 +86,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in store <16 x i16> %ld, <16 x i16> addrspace(3)* %out @@ -102,7 +102,7 @@ entry: ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -121,7 +121,7 @@ define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal ; EG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -136,7 +136,7 @@ define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -153,7 +153,7 @@ define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal ; EG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -166,7 +166,7 @@ define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1 ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_zextload_v2i16_to_v2i32(<2 x i32> 
addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -181,7 +181,7 @@ define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1 ; EG: LDS_READ_RET ; EG: BFE_INT ; EG: BFE_INT -define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -194,7 +194,7 @@ define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1 ; GCN-DAG: ds_write_b64 ; EG: LDS_READ_RET -define void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -211,7 +211,7 @@ entry: ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -226,7 +226,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -244,7 +244,7 @@ define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, < ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -258,7 +258,7 @@ define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -280,7 +280,7 @@ define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1 ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void 
@local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -304,7 +304,7 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -340,7 +340,7 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -369,7 +369,7 @@ define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -406,7 +406,7 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -471,7 +471,7 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(3)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -512,7 +512,7 @@ define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(3)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -531,7 +531,7 @@ define void 
@local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG-DAG: LDS_WRITE ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -558,7 +558,7 @@ define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* ; EG-DAG: LDS_WRITE ; EG-DAG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -573,7 +573,7 @@ define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG-DAG: LDS_WRITE ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -590,7 +590,7 @@ define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1 ; EG-DAG: LDS_WRITE ; EG-DAG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -600,7 +600,7 @@ define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64: ; EG: LDS_READ_RET -define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -612,7 +612,7 @@ define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1 ; EG: LDS_READ_RET ; EG-DAG: BFE_INT ; EG-DAG: ASHR -define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -623,7 +623,7 @@ define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load 
<4 x i16>, <4 x i16> addrspace(3)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -638,7 +638,7 @@ define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1 ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -651,7 +651,7 @@ define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -672,7 +672,7 @@ define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1 ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -689,7 +689,7 @@ define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -722,7 +722,7 @@ define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -747,7 +747,7 @@ define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -804,7 +804,7 @@ define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* 
%out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -812,7 +812,7 @@ define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 } ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64: -; define void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -820,7 +820,7 @@ define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 ; } ; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64: -; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out diff --git a/test/CodeGen/AMDGPU/load-local-i32.ll b/test/CodeGen/AMDGPU/load-local-i32.ll index 280f9658ef8d..86055413d2cf 100644 --- a/test/CodeGen/AMDGPU/load-local-i32.ll +++ b/test/CodeGen/AMDGPU/load-local-i32.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { entry: %ld = load i32, i32 addrspace(3)* %in store i32 %ld, i32 addrspace(3)* %out @@ -18,7 +18,7 @@ entry: ; FUNC-LABEL: {{^}}local_load_v2i32: ; GCN: ds_read_b64 -define void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in store <2 x i32> %ld, <2 x i32> addrspace(3)* %out @@ -28,7 +28,7 @@ entry: ; FUNC-LABEL: {{^}}local_load_v3i32: ; GCN-DAG: ds_read_b64 ; GCN-DAG: ds_read_b32 -define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in store <3 x i32> %ld, <3 x i32> addrspace(3)* %out @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}local_load_v4i32: ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in store <4 x i32> %ld, <4 x i32> addrspace(3)* %out @@ -48,7 +48,7 @@ entry: ; FUNC-LABEL: {{^}}local_load_v8i32: ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel 
void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in store <8 x i32> %ld, <8 x i32> addrspace(3)* %out @@ -64,7 +64,7 @@ entry: ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 -define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in store <16 x i32> %ld, <16 x i32> addrspace(3)* %out @@ -72,7 +72,7 @@ entry: } ; FUNC-LABEL: {{^}}local_zextload_i32_to_i64: -define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { %ld = load i32, i32 addrspace(3)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(3)* %out @@ -80,7 +80,7 @@ define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* } ; FUNC-LABEL: {{^}}local_sextload_i32_to_i64: -define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { %ld = load i32, i32 addrspace(3)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(3)* %out @@ -88,7 +88,7 @@ define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* } ; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64: -define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -96,7 +96,7 @@ define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64: -define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -104,7 +104,7 @@ define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i3 } ; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64: -define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -112,7 +112,7 @@ define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64: -define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void 
@local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -120,7 +120,7 @@ define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i3 } ; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64: -define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -128,7 +128,7 @@ define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64: -define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -136,7 +136,7 @@ define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i3 } ; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64: -define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -144,7 +144,7 @@ define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64: -define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -152,7 +152,7 @@ define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64: -define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -160,7 +160,7 @@ define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 } ; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64 -define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -168,7 +168,7 @@ define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 } ; FUNC-LABEL: 
{{^}}local_sextload_v32i32_to_v32i64: -define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -176,7 +176,7 @@ define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 } ; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64: -define void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out diff --git a/test/CodeGen/AMDGPU/load-local-i64.ll b/test/CodeGen/AMDGPU/load-local-i64.ll index 180807df7b9a..0c719a9e0bf9 100644 --- a/test/CodeGen/AMDGPU/load-local-i64.ll +++ b/test/CodeGen/AMDGPU/load-local-i64.ll @@ -9,7 +9,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 { %ld = load i64, i64 addrspace(3)* %in store i64 %ld, i64 addrspace(3)* %out ret void @@ -22,7 +22,7 @@ define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 { ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in store <2 x i64> %ld, <2 x i64> addrspace(3)* %out @@ -39,7 +39,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in store <3 x i64> %ld, <3 x i64> addrspace(3)* %out @@ -59,7 +59,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(3)* %in store <4 x i64> %ld, <4 x i64> addrspace(3)* %out @@ -88,7 +88,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(3)* %in store <8 x i64> %ld, <8 x i64> addrspace(3)* %out @@ -144,7 +144,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(3)* %in store <16 x i64> %ld, <16 x i64> addrspace(3)* %out diff --git 
a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll index 9ffc74213dd5..731996ec6c45 100644 --- a/test/CodeGen/AMDGPU/load-local-i8.ll +++ b/test/CodeGen/AMDGPU/load-local-i8.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_u8 ; EG: LDS_UBYTE_READ_RET -define void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { entry: %ld = load i8, i8 addrspace(3)* %in store i8 %ld, i8 addrspace(3)* %out @@ -22,7 +22,7 @@ entry: ; GCN: ds_read_u16 ; EG: LDS_USHORT_READ_RET -define void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { entry: %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in store <2 x i8> %ld, <2 x i8> addrspace(3)* %out @@ -33,7 +33,7 @@ entry: ; GCN: ds_read_b32 ; EG: DS_READ_RET -define void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in store <3 x i8> %ld, <3 x i8> addrspace(3)* %out @@ -44,7 +44,7 @@ entry: ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in store <4 x i8> %ld, <4 x i8> addrspace(3)* %out @@ -56,7 +56,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in store <8 x i8> %ld, <8 x i8> addrspace(3)* %out @@ -71,7 +71,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in store <16 x i8> %ld, <16 x i8> addrspace(3)* %out @@ -84,7 +84,7 @@ entry: ; GCN: ds_read_u8 ; EG: LDS_UBYTE_READ_RET -define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -98,7 +98,7 @@ define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* % ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %ld = load i8, i8 addrspace(3)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(3)* %out @@ -108,7 +108,7 @@ define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* % ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32: ; EG: LDS_UBYTE_READ_RET -define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* 
%in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -119,7 +119,7 @@ define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -130,7 +130,7 @@ define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> ; GCN: ds_read_u16 ; EG: LDS_USHORT_READ_RET -define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -156,7 +156,7 @@ define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> ; EG: LDS_USHORT_READ_RET ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -172,7 +172,7 @@ define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, ; EG: LDS_READ_RET -define void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -197,7 +197,7 @@ entry: ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -214,7 +214,7 @@ entry: ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -231,7 +231,7 @@ define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* 
%out @@ -248,7 +248,7 @@ define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -267,7 +267,7 @@ define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -292,7 +292,7 @@ define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -321,7 +321,7 @@ define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -338,7 +338,7 @@ define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -355,7 +355,7 @@ define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -380,7 +380,7 @@ define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void 
@local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(3)* %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -405,7 +405,7 @@ define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(3)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -420,7 +420,7 @@ define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x ; EG: LDS_UBYTE_READ_RET ; EG: MOV {{.*}}, literal ; EG: 0.0 -define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -437,7 +437,7 @@ define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* % ; EG: ASHR ; TODO: why not 7? ; EG: 31 -define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -450,7 +450,7 @@ define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* % ; EG: MOV {{.*}}, literal ; TODO: merge? ; EG: 0.0 -define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -463,7 +463,7 @@ define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> ; EG: ASHR ; TODO: why not 7? 
; EG: 31 -define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -473,7 +473,7 @@ define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64: ; EG: LDS_USHORT_READ_RET -define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -485,7 +485,7 @@ define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> ; EG: LDS_USHORT_READ_RET ; EG: BFE_INT ; EG: BFE_INT -define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -495,7 +495,7 @@ define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64: ; EG: LDS_READ_RET -define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -505,7 +505,7 @@ define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64: ; EG: LDS_READ_RET -define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -516,7 +516,7 @@ define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -536,7 +536,7 @@ define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -549,7 
+549,7 @@ define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -562,7 +562,7 @@ define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -579,7 +579,7 @@ define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -596,7 +596,7 @@ define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -604,7 +604,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x } ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64: -; define void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -612,7 +612,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x ; } ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64: -; define void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -625,7 +625,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x ; EG: LDS_UBYTE_READ_RET ; EG: LDS_SHORT_WRITE -define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, 
i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(3)* %out @@ -639,7 +639,7 @@ define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* % ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT ; EG: LDS_SHORT_WRITE -define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(3)* %out @@ -650,7 +650,7 @@ define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* % ; EG: LDS_UBYTE_READ_RET ; EG: LDS_SHORT_WRITE -define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(3)* %out @@ -662,7 +662,7 @@ define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT ; EG: LDS_SHORT_WRITE -define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(3)* %out @@ -673,7 +673,7 @@ define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> ; EG: LDS_USHORT_READ_RET ; EG: LDS_WRITE -define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(3)* %out @@ -686,7 +686,7 @@ define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> ; EG: BFE_INT ; EG: BFE_INT ; EG: LDS_WRITE -define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(3)* %out @@ -698,7 +698,7 @@ define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> ; EG: LDS_READ_RET ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(3)* %out @@ -715,7 +715,7 @@ define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> ; EG-DAG: BFE_INT ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = 
load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(3)* %out @@ -730,7 +730,7 @@ define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(3)* %out @@ -754,7 +754,7 @@ define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(3)* %out @@ -775,7 +775,7 @@ define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(3)* %out @@ -813,7 +813,7 @@ define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(3)* %out @@ -846,7 +846,7 @@ define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(3)* %out @@ -908,7 +908,7 @@ define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(3)* %out @@ -916,7 +916,7 @@ define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x } ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16: -; define void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; 
define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out @@ -924,7 +924,7 @@ define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x ; } ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16: -; define void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll index bc5e4945fb04..d6162c388b5b 100644 --- a/test/CodeGen/AMDGPU/load-weird-sizes.ll +++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll @@ -8,7 +8,7 @@ ; SI: {{flat|buffer}}_load_ubyte ; SI: {{flat|buffer}}_load_ushort ; SI: {{flat|buffer}}_store_dword -define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { +define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { %1 = load i24, i24 addrspace(1)* %in %2 = zext i24 %1 to i32 store i32 %2, i32 addrspace(1)* %out @@ -21,7 +21,7 @@ define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { ; CI-HSA: flat_load_dword [[VAL:v[0-9]+]] ; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]] -define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 { +define amdgpu_kernel void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 { %1 = load i25, i25 addrspace(1)* %in %2 = zext i25 %1 to i32 store i32 %2, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll index a7cee43187c1..bf4a93237bd4 100644 --- a/test/CodeGen/AMDGPU/local-64.ll +++ b/test/CodeGen/AMDGPU/local-64.ll @@ -5,7 +5,7 @@ ; BOTH-LABEL: {{^}}local_i32_load ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 ; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 %val = load i32, i32 addrspace(3)* %gep, align 4 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -15,7 +15,7 @@ define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounw ; BOTH-LABEL: {{^}}local_i32_load_0_offset ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} ; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %val = load i32, i32 addrspace(3)* %in, align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void @@ -25,7 +25,7 @@ define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* % ; BOTH-NOT: ADD ; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 ; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 
addrspace(3)* %in, i32 65535 %val = load i8, i8 addrspace(3)* %gep, align 4 store i8 %val, i8 addrspace(1)* %out, align 4 @@ -40,7 +40,7 @@ define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3) ; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] ; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] ; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 %val = load i8, i8 addrspace(3)* %gep, align 4 store i8 %val, i8 addrspace(1)* %out, align 4 @@ -51,7 +51,7 @@ define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspa ; BOTH-NOT: ADD ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 %val = load i64, i64 addrspace(3)* %gep, align 8 store i64 %val, i64 addrspace(1)* %out, align 8 @@ -61,7 +61,7 @@ define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounw ; BOTH-LABEL: {{^}}local_i64_load_0_offset ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %val = load i64, i64 addrspace(3)* %in, align 8 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -71,7 +71,7 @@ define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* % ; BOTH-NOT: ADD ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %gep = getelementptr double, double addrspace(3)* %in, i32 7 %val = load double, double addrspace(3)* %gep, align 8 store double %val, double addrspace(1)* %out, align 8 @@ -81,7 +81,7 @@ define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) ; BOTH-LABEL: {{^}}local_f64_load_0_offset ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %val = load double, double addrspace(3)* %in, align 8 store double %val, double addrspace(1)* %out, align 8 ret void @@ -90,7 +90,7 @@ define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace ; BOTH-LABEL: {{^}}local_i64_store: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_i64_store(i64 addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 store i64 5678, i64 addrspace(3)* %gep, align 8 ret void @@ -99,7 +99,7 @@ define void @local_i64_store(i64 addrspace(3)* %out) 
nounwind { ; BOTH-LABEL: {{^}}local_i64_store_0_offset: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { store i64 1234, i64 addrspace(3)* %out, align 8 ret void } @@ -107,7 +107,7 @@ define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { ; BOTH-LABEL: {{^}}local_f64_store: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_f64_store(double addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind { %gep = getelementptr double, double addrspace(3)* %out, i32 7 store double 16.0, double addrspace(3)* %gep, align 8 ret void @@ -115,7 +115,7 @@ define void @local_f64_store(double addrspace(3)* %out) nounwind { ; BOTH-LABEL: {{^}}local_f64_store_0_offset ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { store double 20.0, double addrspace(3)* %out, align 8 ret void } @@ -124,7 +124,7 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { ; BOTH-NOT: ADD ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 ; BOTH: s_endpgm -define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16 ret void @@ -134,7 +134,7 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { ; BOTH-NOT: ADD ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 ; BOTH: s_endpgm -define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16 ret void } @@ -144,7 +144,7 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 ; BOTH: s_endpgm -define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16 ret void @@ -155,7 +155,7 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 ; BOTH: s_endpgm -define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16 ret void } diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll index 6714a28aa43a..de029d964b0d
100644 --- a/test/CodeGen/AMDGPU/local-atomics.ll +++ b/test/CodeGen/AMDGPU/local-atomics.ll @@ -11,7 +11,7 @@ ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; EG: LDS_WRXCHG_RET * ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -37,7 +37,7 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -47,7 +47,7 @@ define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_ADD_RET * ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -59,7 +59,7 @@ define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -73,7 +73,7 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -84,7 +84,7 @@ define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 
addrspace(3)* % ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -96,7 +96,7 @@ define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -109,7 +109,7 @@ define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 a ; EG: LDS_SUB_RET * ; GCN: ds_sub_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -119,7 +119,7 @@ define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_SUB_RET * ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -131,7 +131,7 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -142,7 +142,7 @@ define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; EG: LDS_AND_RET * ; GCN: ds_and_rtn_b32 ; GCN: 
s_endpgm -define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -163,7 +163,7 @@ define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_AND_RET * ; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -174,7 +174,7 @@ define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; EG: LDS_OR_RET * ; GCN: ds_or_rtn_b32 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -184,7 +184,7 @@ define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %pt ; EG: LDS_OR_RET * ; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -195,7 +195,7 @@ define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace( ; EG: LDS_XOR_RET * ; GCN: ds_xor_rtn_b32 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -205,7 +205,7 @@ define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_XOR_RET * ; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -214,7 +214,7 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; FIXME: There is no atomic nand instr ; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst ; store i32 %result, i32 addrspace(1)* %out, align 4 ; ret void @@ -224,7 +224,7 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; EG: LDS_MIN_INT_RET * ; GCN: ds_min_rtn_i32 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -234,7 +234,7 @@ define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_MIN_INT_RET * ; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -245,7 +245,7 @@ define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; EG: LDS_MAX_INT_RET * ; GCN: ds_max_rtn_i32 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -255,7 +255,7 @@ define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_MAX_INT_RET * ; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -266,7 +266,7 @@ define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; EG: LDS_MIN_UINT_RET * ; GCN: ds_min_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -276,7 +276,7 @@ define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; EG: LDS_MIN_UINT_RET * ; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i32 addrspace(3)* 
%gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -287,7 +287,7 @@ define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; EG: LDS_MAX_UINT_RET * ; GCN: ds_max_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -297,7 +297,7 @@ define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; EG: LDS_MAX_UINT_RET * ; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -310,7 +310,7 @@ define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -318,7 +318,7 @@ define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -330,7 +330,7 @@ define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_add_u32 [[VPTR]], [[DATA]] ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -338,7 +338,7 @@ define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -348,7 +348,7 @@ define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 
%b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -360,7 +360,7 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst ret void } @@ -369,7 +369,7 @@ define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst ret void @@ -379,7 +379,7 @@ define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -390,7 +390,7 @@ define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: ; GCN: ds_sub_u32 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -398,7 +398,7 @@ define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset: ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -408,7 +408,7 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst ret void } @@ -417,7 +417,7 @@ define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst ret void @@ -426,7 +426,7 @@ define void @lds_atomic_sub1_noret_i32_offset(i32 
addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
 ; GCN: ds_and_b32
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -434,7 +434,7 @@ define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
 ; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -443,7 +443,7 @@ define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
 ; GCN: ds_or_b32
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -451,7 +451,7 @@ define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
 ; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -460,7 +460,7 @@ define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
 ; GCN: ds_xor_b32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -468,7 +468,7 @@ define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
 ; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -476,7 +476,7 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
-; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst ; ret void ; } @@ -484,7 +484,7 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: ; GCN: ds_min_i32 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -492,7 +492,7 @@ define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: ; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -501,7 +501,7 @@ define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: ; GCN: ds_max_i32 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -509,7 +509,7 @@ define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: ; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -518,7 +518,7 @@ define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: ; GCN: ds_min_u32 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -526,7 +526,7 @@ define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: ; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -535,7 +535,7 @@ define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: ; GCN: ds_max_u32 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -543,7 +543,7 @@ define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) 
nounwind { ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: ; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst ret void diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll index c88917812eda..6572a7bcd4fe 100644 --- a/test/CodeGen/AMDGPU/local-atomics64.ll +++ b/test/CodeGen/AMDGPU/local-atomics64.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64: ; GCN: ds_wrxchg_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -13,7 +13,7 @@ define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -23,7 +23,7 @@ define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64: ; GCN: ds_add_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -38,7 +38,7 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -51,7 +51,7 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -60,7 +60,7 @@ define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: 
{{^}}lds_atomic_add1_ret_i64_offset: ; GCN: ds_add_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -70,7 +70,7 @@ define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64: ; GCN: ds_sub_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -79,7 +79,7 @@ define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: ; GCN: ds_sub_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -92,7 +92,7 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -101,7 +101,7 @@ define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset: ; GCN: ds_sub_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -111,7 +111,7 @@ define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64: ; GCN: ds_and_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -120,7 +120,7 @@ define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset: ; GCN: ds_and_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 
addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -130,7 +130,7 @@ define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64:
 ; GCN: ds_or_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -139,7 +139,7 @@ define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %pt
 ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
 ; GCN: ds_or_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -149,7 +149,7 @@ define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(
 ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64:
 ; GCN: ds_xor_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -158,7 +158,7 @@ define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
 ; GCN: ds_xor_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -167,7 +167,7 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:
-; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void @@ -176,7 +176,7 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64: ; GCN: ds_min_rtn_i64 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -185,7 +185,7 @@ define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset: ; GCN: ds_min_rtn_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -195,7 +195,7 @@ define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64: ; GCN: ds_max_rtn_i64 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -204,7 +204,7 @@ define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset: ; GCN: ds_max_rtn_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -214,7 +214,7 @@ define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64: ; GCN: ds_min_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -223,7 +223,7 @@ define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset: ; GCN: ds_min_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = 
atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -233,7 +233,7 @@ define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64: ; GCN: ds_max_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -242,7 +242,7 @@ define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset: ; GCN: ds_max_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -252,7 +252,7 @@ define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64: ; GCN: ds_wrxchg_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -260,7 +260,7 @@ define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset: ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -269,7 +269,7 @@ define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_add_noret_i64: ; GCN: ds_add_u64 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -282,7 +282,7 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst ret void @@ -293,7 +293,7 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 
seq_cst ret void } @@ -301,7 +301,7 @@ define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset: ; GCN: ds_add_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst ret void @@ -310,7 +310,7 @@ define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64: ; GCN: ds_sub_u64 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -318,7 +318,7 @@ define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: ; GCN: ds_sub_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -329,7 +329,7 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} ; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst ret void } @@ -337,7 +337,7 @@ define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset: ; GCN: ds_sub_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst ret void @@ -346,7 +346,7 @@ define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64: ; GCN: ds_and_b64 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -354,7 +354,7 @@ define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset: ; GCN: ds_and_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -363,7 +363,7 @@ define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64: ; GCN: ds_or_b64 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) 
nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -371,7 +371,7 @@ define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
 ; GCN: ds_or_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -380,7 +380,7 @@ define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64:
 ; GCN: ds_xor_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -388,7 +388,7 @@ define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
 ; GCN: ds_xor_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -396,7 +396,7 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:
-; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst ; ret void ; } @@ -404,7 +404,7 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64: ; GCN: ds_min_i64 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -412,7 +412,7 @@ define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset: ; GCN: ds_min_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -421,7 +421,7 @@ define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64: ; GCN: ds_max_i64 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -429,7 +429,7 @@ define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset: ; GCN: ds_max_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -438,7 +438,7 @@ define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64: ; GCN: ds_min_u64 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -446,7 +446,7 @@ define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset: ; GCN: ds_min_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -455,7 +455,7 @@ define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64: ; GCN: ds_max_u64 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -463,7 +463,7 @@ define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset: 
; GCN: ds_max_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst ret void diff --git a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index a57e4f595322..47b6558241b9 100644 --- a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -17,7 +17,7 @@ ; GCN: s_barrier ; GCN: ds_read_b32 {{v[0-9]+}}, -define void @local_memory(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 { entry: %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i @@ -45,11 +45,7 @@ entry: ; GCN-LABEL: {{^}}local_memory_two_objects: ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 - -; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]] - -; SI-DAG: ds_write_b32 [[ADDRW]], -; SI-DAG: ds_write_b32 [[ADDRW_OFF]], +; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; GCN: s_barrier @@ -61,7 +57,7 @@ entry: ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 -define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll index 1a11332f865d..6124237d7638 100644 --- a/test/CodeGen/AMDGPU/local-memory.ll +++ b/test/CodeGen/AMDGPU/local-memory.ll @@ -14,7 +14,7 @@ ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4 ; R600: LDS_READ_RET -define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { entry: %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 %tmp1 = load i32, i32 addrspace(3)* %tmp0 @@ -30,7 +30,7 @@ entry: ; R600: LDS_READ_RET ; GCN-DAG: ds_read_b32 ; GCN-DAG: ds_read2_b32 -define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 { %scalar = load i32, i32 addrspace(3)* %in %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 diff --git a/test/CodeGen/AMDGPU/local-memory.r600.ll b/test/CodeGen/AMDGPU/local-memory.r600.ll index 9841b8882b39..c8f4e4c986a7 100644 --- a/test/CodeGen/AMDGPU/local-memory.r600.ll +++ b/test/CodeGen/AMDGPU/local-memory.r600.ll @@ -15,7 +15,7 @@ ; EG-NEXT: ALU clause ; EG: LDS_READ_RET -define void @local_memory(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 { entry: %y.i = call i32 @llvm.r600.read.tidig.x() #1 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i @@ -57,7 +57,7 @@ 
entry: ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] -define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: %x.i = call i32 @llvm.r600.read.tidig.x() #1 %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll index dc43e8613ddf..d3e0f0be4b5f 100644 --- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll +++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll @@ -8,13 +8,12 @@ ; CHECK-LABEL: {{^}}main: ; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200 -; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}} ; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] -; TODO: add 0? -; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] -; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] +; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] +; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen diff --git a/test/CodeGen/AMDGPU/loop-address.ll b/test/CodeGen/AMDGPU/loop-address.ll index f60d574497de..e25d4f4b4f5f 100644 --- a/test/CodeGen/AMDGPU/loop-address.ll +++ b/test/CodeGen/AMDGPU/loop-address.ll @@ -5,7 +5,7 @@ ;CHECK: LOOP_BREAK @10 ;CHECK: POP @10 -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { +define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { entry: %cmp5 = icmp sgt i32 %iterations, 0 br i1 %cmp5, label %for.body, label %for.end diff --git a/test/CodeGen/AMDGPU/loop-idiom.ll b/test/CodeGen/AMDGPU/loop-idiom.ll index 5fd9806813cd..23ddd6488af9 100644 --- a/test/CodeGen/AMDGPU/loop-idiom.ll +++ b/test/CodeGen/AMDGPU/loop-idiom.ll @@ -9,7 +9,7 @@ ; FUNC: @no_memcpy ; R600-NOT: {{^}}llvm.memcpy ; SI-NOT: {{^}}llvm.memcpy -define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { +define amdgpu_kernel void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { entry: %dest = alloca i8, i32 32 br label %for.body @@ -33,7 +33,7 @@ for.end: ; R600-NOT: {{^}}memset_pattern16: ; SI-NOT: {{^}}llvm.memset ; SI-NOT: {{^}}memset_pattern16: -define void @no_memset(i32 %size) { +define amdgpu_kernel void @no_memset(i32 %size) { entry: %dest = alloca i8, i32 32 br label %for.body diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll index 82564b8bb28d..b9df2cb779ad 100644 --- a/test/CodeGen/AMDGPU/loop_break.ll +++ b/test/CodeGen/AMDGPU/loop_break.ll @@ -27,8 +27,9 @@ ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1 ; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]] -; GCN: s_cmp_gt_i32 s{{[0-9]+}}, -1 -; GCN-NEXT: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]] +; GCN: v_cmp_lt_i32_e32 vcc, -1 +; GCN: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]] ; GCN: ; BB#2: ; %bb4 ; GCN: buffer_load_dword @@ -43,7 +44,7 @@ ; GCN: ; BB#4: ; %bb9 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] ; GCN-NEXT: s_endpgm -define void @break_loop(i32 %arg) #0 { +define amdgpu_kernel void @break_loop(i32 %arg) #0 { bb: 
%id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -64,6 +65,264 @@ bb9: ret void } +; OPT-LABEL: @undef_phi_cond_break_loop( +; OPT: bb1: +; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ] +; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] +; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 undef, i64 %phi.broken) +; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow + +; OPT: bb4: +; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load +; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken) +; OPT-NEXT: br label %Flow + +; OPT: Flow: +; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ] +; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] +; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) +; OPT-NEXT: br i1 %2, label %bb9, label %bb1 + +; OPT: bb9: ; preds = %Flow +; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; OPT-NEXT: store volatile i32 7 +; OPT-NEXT: ret void +define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %tmp = sub i32 %id, %arg + br label %bb1 + +bb1: ; preds = %Flow, %bb + %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv.next = add i32 %lsr.iv, 1 + %cmp0 = icmp slt i32 %lsr.iv.next, 0 + br i1 %cmp0, label %bb4, label %Flow + +bb4: ; preds = %bb1 + %load = load volatile i32, i32 addrspace(1)* undef, align 4 + %cmp1 = icmp sge i32 %tmp, %load + br label %Flow + +Flow: ; preds = %bb4, %bb1 + %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ] + br i1 %tmp3, label %bb9, label %bb1 + +bb9: ; preds = %Flow + store volatile i32 7, i32 addrspace(3)* undef + ret void +} + +; FIXME: ConstantExpr compare of address to null folds away +@lds = addrspace(3) global i32 undef + +; OPT-LABEL: @constexpr_phi_cond_break_loop( +; OPT: bb1: +; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ] +; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] +; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), i64 %phi.broken) +; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow + +; OPT: bb4: +; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load +; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken) +; OPT-NEXT: br label %Flow + +; OPT: Flow: +; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ] +; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] +; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) +; OPT-NEXT: br i1 %2, label %bb9, label %bb1 + +; OPT: bb9: ; preds = %Flow +; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; OPT-NEXT: store volatile i32 7 +; OPT-NEXT: ret void +define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %tmp = sub i32 %id, %arg + br label %bb1 + +bb1: ; preds = %Flow, %bb + %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv.next = add i32 %lsr.iv, 1 + %cmp0 = icmp slt i32 %lsr.iv.next, 0 + br i1 %cmp0, label %bb4, label %Flow + +bb4: ; preds = %bb1 + %load = load volatile i32, i32 addrspace(1)* undef, align 4 + %cmp1 = icmp sge i32 %tmp, %load + br label %Flow + +Flow: ; preds = %bb4, %bb1 + %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %tmp3 = phi i1 [ %cmp1, %bb4 
], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ] + br i1 %tmp3, label %bb9, label %bb1 + +bb9: ; preds = %Flow + store volatile i32 7, i32 addrspace(3)* undef + ret void +} + +; OPT-LABEL: @true_phi_cond_break_loop( +; OPT: bb1: +; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ] +; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] +; OPT: %0 = call i64 @llvm.amdgcn.break(i64 %phi.broken) +; OPT: br i1 %cmp0, label %bb4, label %Flow + +; OPT: bb4: +; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load +; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken) +; OPT-NEXT: br label %Flow + +; OPT: Flow: +; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ] +; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] +; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) +; OPT-NEXT: br i1 %2, label %bb9, label %bb1 + +; OPT: bb9: ; preds = %Flow +; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; OPT-NEXT: store volatile i32 7 +; OPT-NEXT: ret void +define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %tmp = sub i32 %id, %arg + br label %bb1 + +bb1: ; preds = %Flow, %bb + %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv.next = add i32 %lsr.iv, 1 + %cmp0 = icmp slt i32 %lsr.iv.next, 0 + br i1 %cmp0, label %bb4, label %Flow + +bb4: ; preds = %bb1 + %load = load volatile i32, i32 addrspace(1)* undef, align 4 + %cmp1 = icmp sge i32 %tmp, %load + br label %Flow + +Flow: ; preds = %bb4, %bb1 + %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] + br i1 %tmp3, label %bb9, label %bb1 + +bb9: ; preds = %Flow + store volatile i32 7, i32 addrspace(3)* undef + ret void +} + +; OPT-LABEL: @false_phi_cond_break_loop( +; OPT: bb1: +; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ] +; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] +; OPT-NOT: call +; OPT: br i1 %cmp0, label %bb4, label %Flow + +; OPT: bb4: +; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load +; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken) +; OPT-NEXT: br label %Flow + +; OPT: Flow: +; OPT-NEXT: %loop.phi = phi i64 [ %0, %bb4 ], [ %phi.broken, %bb1 ] +; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] +; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) +; OPT-NEXT: br i1 %1, label %bb9, label %bb1 + +; OPT: bb9: ; preds = %Flow +; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; OPT-NEXT: store volatile i32 7 +; OPT-NEXT: ret void +define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %tmp = sub i32 %id, %arg + br label %bb1 + +bb1: ; preds = %Flow, %bb + %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv.next = add i32 %lsr.iv, 1 + %cmp0 = icmp slt i32 %lsr.iv.next, 0 + br i1 %cmp0, label %bb4, label %Flow + +bb4: ; preds = %bb1 + %load = load volatile i32, i32 addrspace(1)* undef, align 4 + %cmp1 = icmp sge i32 %tmp, %load + br label %Flow + +Flow: ; preds = %bb4, %bb1 + %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 ] + br i1 %tmp3, label %bb9, label %bb1 + +bb9: ; preds = %Flow + store volatile i32 7, i32 addrspace(3)* undef + ret void 
+} + +; Swap order of branches in flow block so that the true phi is +; continue. + +; OPT-LABEL: @invert_true_phi_cond_break_loop( +; OPT: bb1: +; OPT-NEXT: %phi.broken = phi i64 [ %1, %Flow ], [ 0, %bb ] +; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] +; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1 +; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0 +; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow + +; OPT: bb4: +; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 +; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load +; OPT-NEXT: br label %Flow + +; OPT: Flow: +; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] +; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] +; OPT-NEXT: %0 = xor i1 %tmp3, true +; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %0, i64 %phi.broken) +; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %1) +; OPT-NEXT: br i1 %2, label %bb9, label %bb1 + +; OPT: bb9: +; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %1) +; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef +; OPT-NEXT: ret void +define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %tmp = sub i32 %id, %arg + br label %bb1 + +bb1: ; preds = %Flow, %bb + %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] + %lsr.iv.next = add i32 %lsr.iv, 1 + %cmp0 = icmp slt i32 %lsr.iv.next, 0 + br i1 %cmp0, label %bb4, label %Flow + +bb4: ; preds = %bb1 + %load = load volatile i32, i32 addrspace(1)* undef, align 4 + %cmp1 = icmp sge i32 %tmp, %load + br label %Flow + +Flow: ; preds = %bb4, %bb1 + %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] + br i1 %tmp3, label %bb1, label %bb9 + +bb9: ; preds = %Flow + store volatile i32 7, i32 addrspace(3)* undef + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll new file mode 100644 index 000000000000..74564f387ede --- /dev/null +++ b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -0,0 +1,117 @@ +; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s + +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1 +declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1 + +declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1 +declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) #1 + +; Test the upper bound for sizes to leave +; OPT-LABEL: @max_size_small_static_memcpy_caller0( +; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) +define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) + ret void +} + +; Smallest static size which will be expanded +; OPT-LABEL: @min_size_large_static_memcpy_caller0( +; OPT-NOT: call +; OPT: getelementptr +; OPT-NEXT: load i8 +; OPT: getelementptr +; OPT-NEXT: store i8 +define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* 
%src, i64 1025, i32 1, i1 false) + ret void +} + +; OPT-LABEL: @max_size_small_static_memmove_caller0( +; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) +define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { + call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) + ret void +} + +; OPT-LABEL: @min_size_large_static_memmove_caller0( +; OPT-NOT: call +; OPT: getelementptr +; OPT-NEXT: load i8 +; OPT: getelementptr +; OPT-NEXT: store i8 +define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { + call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false) + ret void +} + +; OPT-LABEL: @max_size_small_static_memset_caller0( +; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false) +define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false) + ret void +} + +; OPT-LABEL: @min_size_large_static_memset_caller0( +; OPT-NOT: call +; OPT: getelementptr +; OPT: store i8 +define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i32 1, i1 false) + ret void +} + +; OPT-LABEL: @variable_memcpy_caller0( +; OPT-NOT: call +; OPT: phi +define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) + ret void +} + +; OPT-LABEL: @variable_memcpy_caller1( +; OPT-NOT: call +; OPT: phi +define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) + ret void +} + +; OPT-LABEL: @memcpy_multi_use_one_function( +; OPT-NOT: call +; OPT: phi +; OPT-NOT: call +; OPT: phi +; OPT-NOT: call +define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 { + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false) + ret void +} + +; OPT-LABEL: @memcpy_alt_type( +; OPT: phi +; OPT: getelementptr inbounds i8, i8 addrspace(3)* +; OPT: load i8, i8 addrspace(3)* +; OPT: getelementptr inbounds i8, i8 addrspace(1)* +; OPT: store i8 +define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 { + call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false) + ret void +} + +; One of the uses in the function should be expanded, the other left alone. 
+; OPT-LABEL: @memcpy_multi_use_one_function_keep_small( +; OPT: getelementptr inbounds i8, i8 addrspace(1)* +; OPT: load i8, i8 addrspace(1)* +; OPT: getelementptr inbounds i8, i8 addrspace(1)* +; OPT: store i8 + +; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false) +define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 { + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false) + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } diff --git a/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll index e1fad13e0b51..6f5f4ca13b5e 100644 --- a/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll +++ b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range: ; CHECK-NOT: v0 ; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 -define void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 %and = and i32 %id, 1023 @@ -14,9 +14,9 @@ entry: } ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range: -; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0 -; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] -define void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 { +; CHECK-NOT: v_and_b32 +; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 +define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 %and = and i32 %id, 511 @@ -26,9 +26,9 @@ entry: ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1: ; CHECK-NOT: v0 -; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0 -; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] -define void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 { +; CHECK-NOT: v_and_b32 +; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 +define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1 %and = and i32 %id, 255 diff --git a/test/CodeGen/AMDGPU/lshl.ll b/test/CodeGen/AMDGPU/lshl.ll deleted file mode 100644 index 8468437c2c1f..000000000000 --- a/test/CodeGen/AMDGPU/lshl.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = mul i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/lshr.ll b/test/CodeGen/AMDGPU/lshr.ll deleted file mode 100644 index 
c8ab7871434e..000000000000 --- a/test/CodeGen/AMDGPU/lshr.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = udiv i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll new file mode 100644 index 000000000000..e21d0d09bb41 --- /dev/null +++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -0,0 +1,149 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s + +; GCN-LABEL: {{^}}s_lshr_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] + +; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { + %result = lshr <2 x i16> %lhs, %rhs + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_lshr_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] + +; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_lshrrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]] +; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[RHS]] +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <2 x i16>, <2 x i16> addrspace(1)* 
%in.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr + %result = lshr <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_v_s_v2i16: +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = lshr <2 x i16> %vgpr, %sgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_s_v_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = lshr <2 x i16> %sgpr, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_imm_v_v2i16: +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 +define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_v_imm_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] +define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = lshr <2 x i16> %vgpr, <i16 8, i16 8> + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_lshr_v4i16: +; GCN: {{buffer|flat}}_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 +; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: {{buffer|flat}}_store_dwordx2 +define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* 
%in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr + %result = lshr <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_v_imm_v4i16: +; GCN: {{buffer|flat}}_load_dwordx2 +; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GCN: {{buffer|flat}}_store_dwordx2 +define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep + %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8> + store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll index 9caba32cbac0..b855fc500c6b 100644 --- a/test/CodeGen/AMDGPU/mad-combine.ll +++ b/test/CodeGen/AMDGPU/mad-combine.ll @@ -31,7 +31,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #0 ; SI-DENORM: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] -define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -70,7 +70,7 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -107,7 +107,7 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a ; SI-DENORM: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] -define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -137,7 +137,7 @@ define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrsp ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ;
SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -174,7 +174,7 @@ define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float a ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -209,7 +209,7 @@ define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, fl ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] ; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -245,7 +245,7 @@ define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float a ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -281,7 +281,7 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -319,7 +319,7 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} 
; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -362,7 +362,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -404,7 +404,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %ou ; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -447,7 +447,7 @@ define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %o ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -497,7 +497,7 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -548,7 +548,7 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float 
addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/test/CodeGen/AMDGPU/mad24-get-global-id.ll index 9183ae0972dc..1e78c4ebcc9f 100644 --- a/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -11,7 +11,7 @@ declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]] ; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0 -define void @get_global_id_0(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1 diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll index f149ea0a6a0e..af0159aa9b10 100644 --- a/test/CodeGen/AMDGPU/mad_int24.ll +++ b/test/CodeGen/AMDGPU/mad_int24.ll @@ -11,7 +11,7 @@ ; CM: MULADD_INT24 ; SI-NOT: and ; SI: v_mad_i32_i24 -define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 %a_24 = ashr i32 %0, 8 diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll index 9fde950f822c..2c4f7d324a96 100644 --- a/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/test/CodeGen/AMDGPU/mad_uint24.ll @@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; SI: v_mad_u32_u24 ; VI: v_mad_u32_u24 -define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 @@ -32,7 +32,7 @@ entry: ; FIXME: Should be using scalar instructions here. 
; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 -define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { entry: %0 = mul i16 %a, %b %1 = add i16 %0, %c @@ -49,7 +49,7 @@ entry: ; EG: 8 ; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 -define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { entry: %0 = mul i8 %a, %b %1 = add i8 %0, %c @@ -68,7 +68,7 @@ entry: ; FUNC-LABEL: {{^}}i24_i32_i32_mad: ; EG: CNDE_INT ; SI: v_cndmask -define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { entry: %0 = ashr i32 %a, 8 %1 = icmp ne i32 %c, 0 diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll index 6722aa79dd5d..eb4066a2a0a8 100644 --- a/test/CodeGen/AMDGPU/madak.ll +++ b/test/CodeGen/AMDGPU/madak.ll @@ -10,7 +10,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -37,7 +37,7 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa ; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]] ; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]] ; GCN: s_endpgm -define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -64,7 +64,7 @@ define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1 ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { +define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -84,7 +84,7 @@ define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addr ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define 
amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -106,7 +106,7 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] -define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { +define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -125,7 +125,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] -define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -141,7 +141,7 @@ define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float add ; GCN-LABEL: {{^}}s_s_madak_f32: ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %mul = fmul float %a, %b %madak = fadd float %mul, 10.0 store float %madak, float addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwin ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} ; GCN: s_endpgm -define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -175,7 +175,7 @@ define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} ; GCN: s_endpgm -define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 
@llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -201,7 +201,7 @@ define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float ; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]] ; GCN: buffer_store_dword [[MUL]] -define void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { +define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { bb: %tmp = icmp eq i32 %arg1, 0 br i1 %tmp, label %bb3, label %bb4 diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll index 27fbf58d26c6..6e70e95383c9 100644 --- a/test/CodeGen/AMDGPU/madmk.ll +++ b/test/CodeGen/AMDGPU/madmk.ll @@ -12,7 +12,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]] -define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -35,7 +35,7 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa ; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]] ; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]] ; GCN: s_endpgm -define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -64,7 +64,7 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]] -define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -83,7 +83,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -97,7 +97,7 @@ define void 
@s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) ; GCN-NOT: v_madmk_f32 ; GCN: v_mad_f32 ; GCN: s_endpgm -define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { +define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -113,7 +113,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { +define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -130,7 +130,7 @@ define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float add ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]] -define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -151,7 +151,7 @@ define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| -define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -172,7 +172,7 @@ define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float ; GCN: buffer_load_dword [[A:v[0-9]+]] ; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 -define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -189,7 +189,7 @@ define void @madmk_add_inline_imm_f32(float 
addrspace(1)* noalias %out, float ad ; SI: s_xor_b64 ; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}} ; SI: s_or_b64 -define void @kill_madmk_verifier_error() nounwind { +define amdgpu_kernel void @kill_madmk_verifier_error() nounwind { bb: br label %bb2 diff --git a/test/CodeGen/AMDGPU/max.i16.ll b/test/CodeGen/AMDGPU/max.i16.ll index 3f2a87f20691..abd75258c4d4 100644 --- a/test/CodeGen/AMDGPU/max.i16.ll +++ b/test/CodeGen/AMDGPU/max.i16.ll @@ -1,12 +1,10 @@ -; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s - - -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VIPLUS %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIPLUS %s ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sge_i16: -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -20,12 +18,56 @@ define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr } ; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_imax_sge_v4i16: +; GCN-LABEL: {{^}}v_test_imax_sge_v2i16: ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 + +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4 + %cmp = icmp sge <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4 + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_imax_sge_v3i16: +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-NOT: v_max_i16 + +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid + %a = load <3 x i16>, <3 x i16> addrspace(1)* %gep0, align 4 + %b = load <3 x i16>, <3 x i16> addrspace(1)* %gep1, align 4 + %cmp = icmp sge <3 x i16> %a, %b + %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + store <3 x i16> %val, <3 x i16> addrspace(1)* %outgep, align 4 + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_imax_sge_v4i16: ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { +; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 + +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid @@ -40,8 +82,8 @@ define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrs ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sgt_i16: -; VI: v_max_i16_e32 -define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +; VIPLUS: v_max_i16_e32 +define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -56,8 +98,8 @@ define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; FIXME: Need to handle non-uniform case for function below (load without gep). 
; GCN-LABEL: {{^}}v_test_umax_uge_i16: -; VI: v_max_u16_e32 -define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +; VIPLUS: v_max_u16_e32 +define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -72,8 +114,8 @@ define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_umax_ugt_i16: -; VI: v_max_u16_e32 -define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +; VIPLUS: v_max_u16_e32 +define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -85,3 +127,23 @@ define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr store i16 %val, i16 addrspace(1)* %outgep, align 4 ret void } + +; GCN-LABEL: {{^}}v_test_umax_ugt_v2i16: +; VI: v_max_u16_e32 +; VI: v_max_u16_sdwa + +; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4 + %cmp = icmp ugt <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll index 5fa307be0fd5..ffcdac03bc74 100644 --- a/test/CodeGen/AMDGPU/max.ll +++ b/test/CodeGen/AMDGPU/max.ll @@ -6,7 +6,7 @@ ; SI: v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp sge i32 %a, %b @@ -26,7 +26,7 @@ define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; EG: MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind { %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4 %b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4 %cmp = icmp sge <4 x i32> %a, %b @@ -39,7 +39,7 @@ define void 
@v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrs ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp sge i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -50,7 +50,7 @@ define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cmp = icmp sge i32 %a, 9 %val = select i1 %cmp, i32 %a, i32 9 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -63,7 +63,7 @@ define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; SI: v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { %a = load i8, i8 addrspace(1)* %aptr, align 1 %b = load i8, i8 addrspace(1)* %bptr, align 1 %cmp = icmp sge i8 %a, %b @@ -76,7 +76,7 @@ define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cmp = icmp sgt i32 %a, 9 %val = select i1 %cmp, i32 %a, i32 9 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { %cmp = icmp sgt <2 x i32> %a, %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 @@ -100,7 +100,7 @@ define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; SI: v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp sgt i32 %a, %b @@ -113,7 +113,7 @@ define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp sgt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -124,7 +124,7 @@ define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define 
amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp uge i32 %a, %b @@ -137,7 +137,7 @@ define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; SI: s_max_u32 ; EG: MAX_UINT -define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp uge i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -155,7 +155,7 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; EG: MAX_UINT ; EG: MAX_UINT ; EG-NOT: MAX_UINT -define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind { +define amdgpu_kernel void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind { %cmp = icmp uge <3 x i32> %a, %b %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4 @@ -168,7 +168,7 @@ define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, < ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { %a = load i8, i8 addrspace(1)* %aptr, align 1 %b = load i8, i8 addrspace(1)* %bptr, align 1 %cmp = icmp uge i8 %a, %b @@ -181,7 +181,7 @@ define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp ugt i32 %a, %b @@ -194,7 +194,7 @@ define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; SI: s_max_u32 ; EG: MAX_UINT -define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp ugt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -207,7 +207,7 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}} ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}} -define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { +define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { %cmp = icmp ugt <2 x i32> %a, %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 @@ -223,7 +223,7 @@ define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_UINT -define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 
zeroext %b) nounwind { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ugt i32 %a.ext, %b.ext @@ -243,7 +243,7 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1 ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_INT -define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp sgt i32 %a.ext, %b.ext @@ -262,7 +262,7 @@ define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { %cmp = icmp sge i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out @@ -275,7 +275,7 @@ define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin ; EG: MAX_UINT ; EG: MAX_UINT -define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp ugt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -287,7 +287,7 @@ define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG: MAX_UINT ; EG: MAX_UINT -define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp uge i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -299,7 +299,7 @@ define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG-DAG: MAX_UINT ; EG-DAG: MAX_INT -define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp sgt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -311,7 +311,7 @@ define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG-DAG: MAX_UINT ; EG-DAG: MAX_INT -define void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp sge i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll index a12dba2eb6e9..4bb4fd46becd 100644 --- a/test/CodeGen/AMDGPU/max3.ll +++ b/test/CodeGen/AMDGPU/max3.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; FUNC-LABEL: @v_test_imax3_sgt_i32 ; SI: v_max3_i32 -define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -23,7 +23,7 @@ 
define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt ; FUNC-LABEL: @v_test_umax3_ugt_i32 ; SI: v_max3_u32 -define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid diff --git a/test/CodeGen/AMDGPU/mem-builtins.ll b/test/CodeGen/AMDGPU/mem-builtins.ll index 97512670f59e..1cbd0c327510 100644 --- a/test/CodeGen/AMDGPU/mem-builtins.ll +++ b/test/CodeGen/AMDGPU/mem-builtins.ll @@ -9,7 +9,7 @@ declare i32 @strcmp(i8* nocapture, i8* nocapture) #1 ; ERROR: error: :0:0: in function test_memcmp void (i8 addrspace(1)*, i8 addrspace(1)*, i32*): unsupported call to function memcmp -define void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 { +define amdgpu_kernel void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 { entry: %cmp = tail call i32 @memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i64 2) store volatile i32 %cmp, i32 addrspace(1)* undef @@ -17,35 +17,35 @@ entry: } ; ERROR: error: :0:0: in function test_memchr void (i8 addrspace(1)*, i32, i64): unsupported call to function memchr -define void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 { +define amdgpu_kernel void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 { %res = call i8 addrspace(1)* @memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) store volatile i8 addrspace(1)* %res, i8 addrspace(1)* addrspace(1)* undef ret void } ; ERROR: error: :0:0: in function test_strcpy void (i8*, i8*): unsupported call to function strcpy -define void @test_strcpy(i8* %dst, i8* %src) #0 { +define amdgpu_kernel void @test_strcpy(i8* %dst, i8* %src) #0 { %res = call i8* @strcpy(i8* %dst, i8* %src) store volatile i8* %res, i8* addrspace(1)* undef ret void } ; ERROR: error: :0:0: in function test_strcmp void (i8*, i8*): unsupported call to function strcmp -define void @test_strcmp(i8* %src0, i8* %src1) #0 { +define amdgpu_kernel void @test_strcmp(i8* %src0, i8* %src1) #0 { %res = call i32 @strcmp(i8* %src0, i8* %src1) store volatile i32 %res, i32 addrspace(1)* undef ret void } ; ERROR: error: :0:0: in function test_strlen void (i8*): unsupported call to function strlen -define void @test_strlen(i8* %src) #0 { +define amdgpu_kernel void @test_strlen(i8* %src) #0 { %res = call i32 @strlen(i8* %src) store volatile i32 %res, i32 addrspace(1)* undef ret void } ; ERROR: error: :0:0: in function test_strnlen void (i8*, i32): unsupported call to function strnlen -define void @test_strnlen(i8* %src, i32 %size) #0 { +define amdgpu_kernel void @test_strnlen(i8* %src, i32 %size) #0 { %res = call i32 @strnlen(i8* %src, i32 %size) store volatile i32 %res, i32 addrspace(1)* undef ret void diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll index 07104ebc8c97..dfd5b97fcc86 100644 --- a/test/CodeGen/AMDGPU/merge-stores.ll +++ b/test/CodeGen/AMDGPU/merge-stores.ll @@ -1,8 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs 
-amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s - -; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s ; This test is mostly to test DAG store merging, so disable the vectorizer. ; Run with devices with different unaligned load restrictions. @@ -16,7 +13,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -28,7 +25,7 @@ define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -38,7 +35,7 @@ define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %o ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: ; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -48,7 +45,7 @@ define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: ; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 0, i16 addrspace(1)* %out.gep.1 @@ -60,7 +57,7 @@ define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { ; GCN: buffer_store_short ; GCN: buffer_store_short ; GCN: s_endpgm -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -72,7 +69,7 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 
{ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -82,7 +79,7 @@ define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: ; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* store float 1.0, float addrspace(1)* %out.gep.1.bc @@ -94,7 +91,7 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* store i32 123, i32 addrspace(1)* %out.gep.1.bc @@ -108,7 +105,7 @@ define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -122,7 +119,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -137,7 +134,7 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) ; First store is out of order. 
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -150,14 +147,9 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32: -; GCN-NOAA: buffer_store_dwordx4 v - -; GCN-AA: buffer_store_dwordx2 -; GCN-AA: buffer_store_dword v -; GCN-AA: buffer_store_dword v - +; GCN-AA: buffer_store_dwordx4 v ; GCN: s_endpgm -define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -177,7 +169,7 @@ define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %o ; SI-DAG: buffer_store_dword ; SI-NOT: buffer_store_dword ; GCN: s_endpgm -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -189,7 +181,7 @@ define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 store i64 123, i64 addrspace(1)* %out.gep.1 @@ -200,7 +192,7 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 @@ -215,7 +207,7 @@ define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx2 [[LOAD]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -230,7 +222,7 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; 
GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 @@ -249,7 +241,7 @@ define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace( ; GCN: buffer_load_dword v ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -264,7 +256,7 @@ define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* % ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -291,7 +283,7 @@ define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; SI-DAG: buffer_store_dword v ; SI-DAG: buffer_store_dwordx2 v ; GCN: s_endpgm -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -310,7 +302,7 @@ define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -333,7 +325,7 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = 
getelementptr i32, i32 addrspace(1)* %in, i32 11 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 @@ -359,7 +351,7 @@ define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace( ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: s_barrier ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -396,7 +388,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* % ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -424,7 +416,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* % ; GCN: buffer_load_dword [[LOAD:v[0-9]+]] ; GCN: buffer_store_dword [[LOAD]] ; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -454,7 +446,7 @@ define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 ad ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -474,19 +466,11 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1 ret void } -; This works once AA is enabled on the subtarget ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] - -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v - -; GCN-AA: buffer_store_dwordx4 [[LOAD]] - +; GCN: buffer_store_dwordx4 [[LOAD]] ; GCN: s_endpgm -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, 
i32 3 @@ -508,7 +492,7 @@ define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: s_endpgm -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 store i8 123, i8 addrspace(3)* %out.gep.1 @@ -520,7 +504,7 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1 @@ -538,7 +522,7 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1 ; GCN: s_endpgm -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 @@ -556,7 +540,7 @@ define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} ; GCN: buffer_store_dword v[[HI]] -define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 12, i32 addrspace(1)* %idx1, align 4 @@ -572,7 +556,7 @@ define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx2 -define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { store i32 13, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 15, i32 addrspace(1)* %idx1, align 4 @@ -591,7 +575,7 @@ define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx2 ; GCN: buffer_store_dword v -define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -612,7 +596,7 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 
@@ -646,7 +630,7 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; GCN: ScratchSize: 0{{$}} -define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 store <3 x i32> %vec, <3 x i32> addrspace(1)* %out ret void @@ -662,7 +646,7 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} ; GCN: ScratchSize: 0{{$}} -define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 store <3 x i64> %vec, <3 x i64> addrspace(1)* %out ret void @@ -678,7 +662,7 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; GCN: ScratchSize: 0{{$}} -define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 %fadd = fadd <3 x float> %vec, store <3 x float> %fadd, <3 x float> addrspace(1)* %out @@ -695,7 +679,7 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} ; GCN: ScratchSize: 0{{$}} -define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 %fadd = fadd <3 x double> %vec, store <3 x double> %fadd, <3 x double> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll index 19d0117d64a9..e85a724c1567 100644 --- a/test/CodeGen/AMDGPU/min.ll +++ b/test/CodeGen/AMDGPU/min.ll @@ -1,17 +1,22 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck 
-check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; GCN: v_min_i32_e32 ; EG: MIN_INT -define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 %cmp = icmp sle i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %val, i32 addrspace(1)* %out.gep, align 4 ret void } @@ -19,7 +24,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp sle i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -30,7 +35,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { +define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %cmp = icmp sle <1 x i32> %a, %b %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b store <1 x i32> %val, <1 x i32> addrspace(1)* %out @@ -47,7 +52,7 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { +define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %cmp = icmp sle <4 x i32> %a, %b %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b store <4 x i32> %val, <4 x i32> addrspace(1)* %out @@ -60,7 +65,7 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, < ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: s_min_i32 -define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { +define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 { %cmp = icmp sle i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b store i8 %val, i8 addrspace(1)* %out @@ -90,30 +95,62 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { ; VI: v_min_i32 ; VI: v_min_i32 +; GFX9: v_min_i16 +; GFX9: v_min_i16 +; GFX9: v_min_i16 +; GFX9: v_min_i16 + ; GCN: s_endpgm ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind { +define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 { %cmp = icmp sle <4 x i8> %a, %b %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %val, <4 x i8> addrspace(1)* %out ret void } +; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16: +; SI: v_min_i32 +; SI: 
v_min_i32 + +; VI: v_min_i32 +; VI: v_min_i32 + +; GFX9: v_pk_min_i16 + +; EG: MIN_INT +; EG: MIN_INT +define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { + %cmp = icmp sle <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %out + ret void +} + +; FIXME: VI use s_min_i32 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 + +; GFX9: v_pk_min_i16 +; GFX9: v_pk_min_i16 + ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { +define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 { %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b store <4 x i16> %val, <4 x i16> addrspace(1)* %out @@ -124,12 +161,36 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, < ; GCN: v_min_i32_e32 ; EG: MIN_INT -define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 %cmp = icmp slt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %val, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_imin_slt_i16 +; SI: v_min_i32_e32 + +; GFX89: v_min_i16_e32 + +; EG: MIN_INT +define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + + %a = load i16, i16 addrspace(1)* %a.gep + %b = load i16, i16 addrspace(1)* %b.gep + %cmp = icmp slt i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out.gep ret void } @@ -137,7 +198,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp slt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -150,7 +211,7 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { +define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %cmp = 
icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %val, <2 x i32> addrspace(1)* %out @@ -161,7 +222,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, < ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { %cmp = icmp slt i32 %a, 8 %val = select i1 %cmp, i32 %a, i32 8 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -172,7 +233,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { %cmp = icmp sle i32 %a, 8 %val = select i1 %cmp, i32 %a, i32 8 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -183,12 +244,16 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; GCN: v_min_u32_e32 ; EG: MIN_UINT -define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 %cmp = icmp ule i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %val, i32 addrspace(1)* %out.gep, align 4 ret void } @@ -196,25 +261,65 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; GCN: v_min_u32_e32 ; GCN: v_min_u32_e32 ; GCN: v_min_u32_e32 -; SI-NOT: v_min_u32_e32 +; GCN-NOT: v_min_u32_e32 ; GCN: s_endpgm ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind { - %a = load <3 x i32>, <3 x i32> addrspace(1)* %aptr - %b = load <3 x i32>, <3 x i32> addrspace(1)* %bptr +define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid + + %a = load <3 x i32>, <3 x i32> addrspace(1)* %a.gep + %b = load <3 x i32>, <3 x i32> addrspace(1)* %b.gep %cmp = icmp ule <3 x i32> %a, %b %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b - store <3 x i32> %val, <3 x i32> addrspace(1)* %out + store <3 x i32> %val, <3 x i32> addrspace(1)* %out.gep + ret void +} + +; FIXME: Reduce unused packed component to scalar +; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}} +; SI: v_min_u32_e32 +; SI: v_min_u32_e32 +; SI: v_min_u32_e32 
+; SI-NOT: v_min_u32_e32 + +; VI: v_min_u16_e32 +; VI: v_min_u16_sdwa +; VI: v_min_u16_e32 +; VI-NOT: v_min_u16_e32 + +; GFX9: v_pk_min_u16 +; GFX9: v_pk_min_u16 + +; GCN: s_endpgm + +; EG: MIN_UINT +; EG: MIN_UINT +; EG: MIN_UINT +define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid + + %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.gep + %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.gep + %cmp = icmp ule <3 x i16> %a, %b + %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + store <3 x i16> %val, <3 x i16> addrspace(1)* %out.gep ret void } + ; FUNC-LABEL: @s_test_umin_ule_i32 ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp ule i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -225,27 +330,40 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; GCN: v_min_u32_e32 ; EG: MIN_UINT -define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 %cmp = icmp ult i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %val, i32 addrspace(1)* %out.gep, align 4 ret void } ; FUNC-LABEL: {{^}}v_test_umin_ult_i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: v_min_u32_e32 +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_min_u32_e32 + +; GFX89: flat_load_ubyte +; GFX89: flat_load_ubyte +; GFX89: v_min_u16_e32 ; EG: MIN_UINT -define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { - %a = load i8, i8 addrspace(1)* %aptr, align 1 - %b = load i8, i8 addrspace(1)* %bptr, align 1 +define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %tid + + %a = load i8, i8 addrspace(1)* %a.gep, align 1 + %b = load i8, i8 addrspace(1)* %b.gep, align 1 %cmp = icmp ult i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b - store i8 %val, i8 addrspace(1)* %out, align 1 + store i8 %val, i8 addrspace(1)* %out.gep, align 1 ret void } @@ -253,7 +371,7 @@ define void @v_test_umin_ult_i8(i8 
addrspace(1)* %out, i8 addrspace(1)* %aptr, i ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp ult i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -268,7 +386,7 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; GCN: s_endpgm ; EG-NOT: MIN_UINT -define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp ult i32 %a, %b @@ -286,7 +404,7 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace ; GCN: s_endpgm ; EG-NOT: MIN_UINT -define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { %a = load i16, i16 addrspace(1)* %aptr, align 2 %b = load i16, i16 addrspace(1)* %bptr, align 2 %cmp = icmp ult i16 %a, %b @@ -301,7 +419,7 @@ define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { +define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %cmp = icmp ult <1 x i32> %a, %b %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b store <1 x i32> %val, <1 x i32> addrspace(1)* %out @@ -326,7 +444,7 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { +define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 { %cmp = icmp ult <8 x i32> %a, %b %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %val, <8 x i32> addrspace(1)* %out @@ -334,14 +452,23 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 + +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT @@ -351,7 +478,7 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, < ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { +define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 { %cmp = icmp ult <8 x i16> %a, %b %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x 
i16> %b store <8 x i16> %val, <8 x i16> addrspace(1)* %out @@ -367,7 +494,7 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, < ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT -define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ult i32 %a.ext, %b.ext @@ -387,7 +514,7 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1 ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_INT -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp slt i32 %a.ext, %b.ext @@ -402,7 +529,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 { %cmp = icmp sle i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out @@ -415,7 +542,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin ; EG: MIN_UINT ; EG: MIN_UINT -define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp ult i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -427,7 +554,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG: MIN_UINT ; EG: MIN_UINT -define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp ule i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -439,7 +566,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT -define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp slt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -451,9 +578,63 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT -define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp sle i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 ret void } + +; FUNC-LABEL: {{^}}v_test_imin_sle_v2i16: +; SI: v_min_i32 +; SI: v_min_i32 + +; VI: v_min_i16 +; VI: v_min_i16 + +; GFX9: v_pk_min_i16 + +; EG: MIN_INT +; EG: MIN_INT +define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds <2 
x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep + %cmp = icmp sle <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; FIXME: i16 min +; FUNC-LABEL: {{^}}v_test_imin_ule_v2i16: +; SI: v_min_u32 +; SI: v_min_u32 + +; VI: v_min_u16 +; VI: v_min_u16 + +; GFX9: v_pk_min_u16 + +; EG: MIN_UINT +; EG: MIN_UINT +define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep + %cmp = icmp ule <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll index 728479ad9f62..59d5d2cdb1aa 100644 --- a/test/CodeGen/AMDGPU/min3.ll +++ b/test/CodeGen/AMDGPU/min3.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; FUNC-LABEL: @v_test_imin3_slt_i32 ; SI: v_min3_i32 -define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -23,7 +23,7 @@ define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt ; FUNC-LABEL: @v_test_umin3_ult_i32 ; SI: v_min3_u32 -define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -43,7 +43,7 @@ define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt ; FUNC-LABEL: @v_test_umin_umin_umin ; SI: v_min_i32 ; SI: v_min3_i32 -define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tid2 = mul i32 %tid, 2 %gep0 = getelementptr i32, i32 
addrspace(1)* %aptr, i32 %tid @@ -77,7 +77,7 @@ define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %ap ; FUNC-LABEL: @v_test_umin3_2_uses ; SI-NOT: v_min3 -define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tid2 = mul i32 %tid, 2 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll index 8e1b0036a1af..83c2a911a5ce 100644 --- a/test/CodeGen/AMDGPU/missing-store.ll +++ b/test/CodeGen/AMDGPU/missing-store.ll @@ -15,7 +15,7 @@ ; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}} ; SI: buffer_store_dword ; SI: s_endpgm -define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll index 85dfbe6b8a33..e1fb00a1de30 100644 --- a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -19,7 +19,7 @@ ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] ; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, -define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { +define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { bb: %tmp = icmp sgt i32 %arg3, 0 br i1 %tmp, label %bb4, label %bb17 diff --git a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll index 1a0a39027853..417b4ba802e1 100644 --- a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -11,7 +11,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN-LABEL: {{^}}atomic_max_i32: ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep @@ -31,7 +31,7 @@ exit: ; GCN-LABEL: {{^}}atomic_max_i32_noret: ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}} -define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { +define amdgpu_kernel void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.gep = getelementptr i32 
addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll index a574365da986..9e1d2e0490c7 100644 --- a/test/CodeGen/AMDGPU/mubuf.ll +++ b/test/CodeGen/AMDGPU/mubuf.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone ; MUBUF load with an immediate byte offset that fits into 12-bits ; CHECK-LABEL: {{^}}mubuf_load0: ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 -define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1 %1 = load i32, i32 addrspace(1)* %0 @@ -20,7 +20,7 @@ entry: ; MUBUF load with the largest possible immediate offset ; CHECK-LABEL: {{^}}mubuf_load1: ; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 -define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 %1 = load i8, i8 addrspace(1)* %0 @@ -32,7 +32,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_load2: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 -define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 %1 = load i32, i32 addrspace(1)* %0 @@ -44,7 +44,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_load3: ; CHECK-NOT: ADD ; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 -define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { +define amdgpu_kernel void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 @@ -91,7 +91,7 @@ main_body: ; MUBUF store with an immediate byte offset that fits into 12-bits ; CHECK-LABEL: {{^}}mubuf_store0: ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 -define void @mubuf_store0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store0(i32 addrspace(1)* %out) { entry: %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 store i32 0, i32 addrspace(1)* %0 @@ -102,7 +102,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_store1: ; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 -define void @mubuf_store1(i8 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store1(i8 addrspace(1)* %out) { entry: %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 store i8 0, i8 addrspace(1)* %0 @@ -113,7 +113,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_store2: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 -define void @mubuf_store2(i32 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store2(i32 addrspace(1)* %out) { entry: %0 = getelementptr i32, i32 
addrspace(1)* %out, i64 1024 store i32 0, i32 addrspace(1)* %0 @@ -124,7 +124,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_store3: ; CHECK-NOT: ADD ; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 -define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { +define amdgpu_kernel void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { entry: %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 @@ -134,14 +134,14 @@ entry: ; CHECK-LABEL: {{^}}store_sgpr_ptr: ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 -define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { store i32 99, i32 addrspace(1)* %out, align 4 ret void } ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 -define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void @@ -150,7 +150,7 @@ define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void @@ -159,7 +159,7 @@ define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst ret void @@ -167,7 +167,7 @@ define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}store_vgpr_ptr: ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid store i32 99, i32 addrspace(1)* %out.gep, align 4 diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll index 7910b70d8cf2..a72a6efb0711 100644 --- a/test/CodeGen/AMDGPU/mul.ll +++ b/test/CodeGen/AMDGPU/mul.ll @@ -11,7 +11,7 @@ ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * 
%b_ptr @@ -31,7 +31,7 @@ define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1) ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -45,7 +45,7 @@ define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* % ; SI: s_load_dword ; SI: s_mul_i32 ; SI: buffer_store_dword -define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %mul = mul i64 %b, %a %trunc = trunc i64 %mul to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 @@ -57,7 +57,7 @@ define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { ; SI: s_load_dword ; SI: v_mul_lo_i32 ; SI: buffer_store_dword -define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %mul = mul i64 %b, %a @@ -73,7 +73,7 @@ define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %a ; EG-DAG: MULHI_INT ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_i32 -define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { entry: %0 = sext i32 %in to i64 %1 = mul i64 %0, 80 @@ -87,7 +87,7 @@ entry: ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_i32 ; SI: s_endpgm -define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -99,7 +99,7 @@ define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 ; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 ; SI: s_endpgm -define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 9 @@ -114,7 +114,7 @@ define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* % ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], ; SI: s_endpgm -define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %mul = mul i32 %a, %b store i32 %mul, i32 addrspace(1)* %out, align 4 ret void @@ -122,7 +122,7 @@ define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; FUNC-LABEL: {{^}}v_mul_i32: ; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr 
i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -139,7 +139,7 @@ define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; crash with a 'failed to select' error. ; FUNC-LABEL: {{^}}s_mul_i64: -define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %mul = mul i64 %a, %b store i64 %mul, i64 addrspace(1)* %out, align 8 ret void @@ -147,7 +147,7 @@ define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; FUNC-LABEL: {{^}}v_mul_i64: ; SI: v_mul_lo_i32 -define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %mul = mul i64 %a, %b @@ -157,7 +157,7 @@ define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr ; FUNC-LABEL: {{^}}mul32_in_branch: ; SI: s_mul_i32 -define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -180,7 +180,7 @@ endif: ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 ; SI: s_endpgm -define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -224,7 +224,7 @@ endif: ; SI: s_mul_i32 ; SI: buffer_store_dwordx4 -define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { +define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { %mul = mul i128 %a, %b store i128 %mul, i128 addrspace(1)* %out ret void @@ -234,26 +234,26 @@ define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { ; SI: {{buffer|flat}}_load_dwordx4 ; SI: {{buffer|flat}}_load_dwordx4 -; SI: v_mul_lo_i32 -; SI: v_mul_hi_u32 -; SI: v_mul_hi_u32 -; SI: v_mul_lo_i32 -; SI: v_mul_hi_u32 -; SI: v_mul_hi_u32 -; SI: v_mul_lo_i32 -; SI: v_mul_lo_i32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_lo_i32 ; SI: v_add_i32_e32 -; SI: v_mul_hi_u32 -; SI: v_mul_lo_i32 -; SI: v_mul_hi_u32 -; SI: v_mul_lo_i32 -; SI: v_mul_lo_i32 -; SI: v_mul_lo_i32 -; SI: v_mul_lo_i32 -; SI: v_mul_lo_i32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_lo_i32 ; SI: {{buffer|flat}}_store_dwordx4 -define void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid %gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll index 
6f7dfe2e13eb..3137569e9ca7 100644 --- a/test/CodeGen/AMDGPU/mul_int24.ll +++ b/test/CodeGen/AMDGPU/mul_int24.ll @@ -13,7 +13,7 @@ ; Make sure we are not masking the inputs ; CM-NOT: AND ; CM: MUL_INT24 -define void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %a.shl = shl i32 %a, 8 %a.24 = ashr i32 %a.shl, 8 @@ -39,7 +39,7 @@ entry: ; CM: MULHI_INT24 ; CM: MULHI_INT24 ; CM: MULHI_INT24 -define void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %a.shl = shl i32 %a, 8 %a.24 = ashr i32 %a.shl, 8 @@ -70,7 +70,7 @@ entry: ; GCN-DAG: v_mul_i32_i24_e32 ; GCN: buffer_store_dwordx2 -define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl.i = shl i32 %a, 8 %shr.i = ashr i32 %shl.i, 8 %conv.i = sext i32 %shr.i to i64 @@ -87,7 +87,7 @@ define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN-DAG: v_mul_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN: buffer_store_dwordx2 -define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl.i = shl i32 %a, 8 %shr.i = ashr i32 %shl.i, 8 %conv.i = sext i32 %shr.i to i64 @@ -112,7 +112,7 @@ define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 -define void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 { +define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 { entry: %a.shl = shl i33 %a, 9 %a.24 = ashr i33 %a.shl, 9 @@ -133,7 +133,7 @@ entry: ; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]], ; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; SI-NEXT: buffer_store_dword v[[HI]] -define void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = ashr i33 %tmp0, 9 @@ -151,7 +151,7 @@ entry: ; GCN: v_mul_i32_i24_e32 v[[VAL_LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[VAL_HI:[0-9]+]], v[[VAL_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} -define void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { +define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { bb: %cmp = icmp eq i32 %arg0, 0 br i1 %cmp, label %bb11, label %bb7 diff --git a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 004d36f00e51..59fdc8be5cea 100644 --- a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; FUNC-LABEL: {{^}}test_umul24_i32: ; GCN: v_mul_u32_u24 -define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 @@ -22,7 +22,7 @@ entry: ; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16 ; VI: s_mul_i32 [[SI_MUL:s[0-9]]], s{{[0-9]}}, s{{[0-9]}} ; VI: 
s_sext_i32_i16 s{{[0-9]}}, [[SI_MUL]] -define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = sext i16 %mul to i32 @@ -34,7 +34,7 @@ entry: ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 -define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x @@ -54,7 +54,7 @@ define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* ; VI: s_mul_i32 ; VI: s_and_b32 ; VI: v_mov_b32_e32 -define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = zext i16 %mul to i32 @@ -66,7 +66,7 @@ entry: ; SI: v_mul_u32_u24_e32 ; SI: v_and_b32_e32 ; VI: v_mul_lo_u16 -define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x @@ -83,7 +83,7 @@ define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 -define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { +define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -101,7 +101,7 @@ entry: ; GCN-NOT: and ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.24 = and i32 %a, 16777215 %b.24 = and i32 %b, 16777215 @@ -118,7 +118,7 @@ entry: ; GCN-NOT: and ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { entry: %a.24 = and i64 %a, 16777215 %b.24 = and i64 %b, 16777215 @@ -136,7 +136,7 @@ entry: ; GCN-DAG: v_mul_u32_u24_e32 ; GCN-DAG: v_mul_hi_u32_u24_e32 ; GCN: buffer_store_dwordx2 -define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: %tmp0 = shl i64 %a, 40 %a_24 = lshr i64 %tmp0, 40 @@ -152,7 +152,7 @@ entry: ; GCN-NOT: s_and_b32 ; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] -define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void 
@test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { entry: %tmp0 = shl i64 %a, 40 %a.24 = lshr i64 %tmp0, 40 @@ -166,7 +166,7 @@ entry: ; GCN: s_and_b32 ; GCN: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]] ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]] -define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.16 = and i32 %a, 65535 %b.16 = and i32 %b, 65535 @@ -186,7 +186,7 @@ entry: ; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], ; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}} -define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = lshr i33 %tmp0, 9 @@ -206,7 +206,7 @@ entry: ; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], ; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; GCN-NEXT: buffer_store_dword v[[HI]] -define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = lshr i33 %tmp0, 9 diff --git a/test/CodeGen/AMDGPU/mul_uint24-r600.ll b/test/CodeGen/AMDGPU/mul_uint24-r600.ll index da1c111fa5c0..0a646b7126d0 100644 --- a/test/CodeGen/AMDGPU/mul_uint24-r600.ll +++ b/test/CodeGen/AMDGPU/mul_uint24-r600.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}test_umul24_i32: ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 @@ -19,7 +19,7 @@ entry: ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x ; EG: 16 -define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = sext i16 %mul to i32 @@ -31,7 +31,7 @@ entry: ; FUNC-LABEL: {{^}}test_umul24_i8: ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { +define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { entry: %mul = mul i8 %a, %b %ext = sext i8 %mul to i32 @@ -41,7 +41,7 @@ entry: ; FUNC-LABEL: {{^}}test_umulhi24_i32_i64: ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.24 = and i32 %a, 16777215 %b.24 = and i32 %b, 16777215 @@ -56,7 +56,7 @@ entry: ; FUNC-LABEL: {{^}}test_umulhi24: ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { entry: %a.24 = and i64 %a, 16777215 %b.24 = and i64 %b, 16777215 @@ -71,7 +71,7 @@ entry: ; FUNC-LABEL: {{^}}test_umul24_i64: ; EG; MUL_UINT24 ; EG: MULHI -define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* 
%out, i64 %a, i64 %b) { entry: %tmp0 = shl i64 %a, 40 %a_24 = lshr i64 %tmp0, 40 diff --git a/test/CodeGen/AMDGPU/mulhu.ll b/test/CodeGen/AMDGPU/mulhu.ll deleted file mode 100644 index 29b0944a5533..000000000000 --- a/test/CodeGen/AMDGPU/mulhu.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab -;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} -;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 - -define void @test(i32 %p) { - %i = udiv i32 %p, 3 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll new file mode 100644 index 000000000000..9d0b6b395996 --- /dev/null +++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -0,0 +1,710 @@ +; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Add an extra verifier runs. There were some cases where invalid IR +; was produced but happened to be fixed by the later passes. + +; Make sure divergent control flow with multiple exits from a region +; is properly handled. UnifyFunctionExitNodes should be run before +; StructurizeCFG. + +; IR-LABEL: @multi_divergent_region_exit_ret_ret( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %2 = extractvalue { i1, i64 } %1, 0 +; IR: %3 = extractvalue { i1, i64 } %1, 1 +; IR: br i1 %2, label %LeafBlock1, label %Flow + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %7 = extractvalue { i1, i64 } %6, 0 +; IR: %8 = extractvalue { i1, i64 } %6, 1 +; IR: br i1 %7, label %LeafBlock, label %Flow1 + +; IR: LeafBlock: +; IR: br label %Flow1 + +; IR: LeafBlock1: +; IR: br label %Flow{{$}} + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: %13 = extractvalue { i1, i64 } %12, 0 +; IR: %14 = extractvalue { i1, i64 } %12, 1 +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: br label %UnifiedReturnBlock + +; IR: Flow1: +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR: store volatile i32 17, i32 addrspace(3)* undef +; IR: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: ret void + + +; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret: +; GCN: v_cmp_lt_i32_e32 vcc, 1 +; GCN: s_and_saveexec_b64 +; GCN: s_xor_b64 + + +; FIXME: Why is this 
compare essentially repeated? +; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] +; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 + +; GCN: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec +; GCN: v_cmp_ne_u32_e32 vcc, 0 + +; GCN: ; %exit1 +; GCN: ds_write_b32 + +; GCN: %Flow2 +; GCN-NEXT: s_or_b64 exec, exec +; GCN: v_cmp_ne_u32_e32 vcc, 0 +; GCN-NEXT: s_and_saveexec_b64 +; GCN-NEXT: s_xor_b64 + +; GCN: ; %exit0 +; GCN: buffer_store_dword + +; GCN: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) + +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock + + +; IR: UnifiedUnreachableBlock: +; IR-NEXT: unreachable + + +; FIXME: Probably should insert an s_endpgm anyway. 
+; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable: +; GCN: ; %UnifiedUnreachableBlock +; GCN-NEXT: .Lfunc_end +define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + unreachable +} + +; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret( +; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2 +; IR: llvm.amdgcn.if +; IR: br i1 + +; IR: {{^}}Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: br i1 %7, label %LeafBlock, label %Flow1 + +; IR: {{^}}LeafBlock: +; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1 +; IR: %9 = xor i1 %divergent.cond1, true +; IR: br label %Flow1 + +; IR: LeafBlock1: +; IR: %uniform.cond0 = icmp eq i32 %arg3, 2 +; IR: %10 = xor i1 %uniform.cond0, true +; IR: br label %Flow + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: br label %UnifiedReturnBlock + +; IR: {{^}}Flow1: +; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR: store volatile i32 17, i32 addrspace(3)* undef +; IR: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: ret void +define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 
= load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %divergent.cond0 = icmp slt i32 %tmp16, 2 + br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %divergent.cond1 = icmp eq i32 %tmp16, 1 + br i1 %divergent.cond1, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %uniform.cond0 = icmp eq i32 %arg3, 2 + br i1 %uniform.cond0, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: br i1 %2, label %LeafBlock1, label %Flow + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) + +define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %arg3, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( +; IR: Flow2: +; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] +; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %20) + +; IR: UnifiedReturnBlock: +; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %15) +; IR: ret float %UnifiedRetVal +define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { +entry: + %Pivot = icmp slt i32 %vgpr, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %vgpr, 1 + br i1 %SwitchLeaf, label %exit0, label 
%exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %vgpr, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store i32 9, i32 addrspace(1)* undef + ret float 1.0 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store i32 17, i32 addrspace(3)* undef + ret float 2.0 +} + +; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value( + +; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value: +; GCN: s_cmp_gt_i32 s0, 1 +; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]] + +; GCN: v_cmp_ne_u32_e32 vcc, 7, v0 + +; GCN: {{^}}[[FLOW]]: +; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]] + +; GCN: v_mov_b32_e32 v0, 2.0 +; GCN: s_or_b64 exec, exec +; GCN: s_and_b64 exec, exec +; GCN: v_mov_b32_e32 v0, 1.0 + +; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: ; return + +define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 { +entry: + %uniform.cond = icmp slt i32 %sgpr, 2 + br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %divergent.cond0 = icmp eq i32 %vgpr, 3 + br i1 %divergent.cond0, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %divergent.cond1 = icmp eq i32 %vgpr, 7 + br i1 %divergent.cond1, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store i32 9, i32 addrspace(1)* undef + ret float 1.0 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store i32 17, i32 addrspace(3)* undef + ret float 2.0 +} + +; IR-LABEL: @multi_divergent_region_exit_ret_unreachable( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef +; IR-NEXT: br label %UnifiedReturnBlock + +; IR: Flow1: +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: ret void +define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = 
zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; The non-uniformity of the branch to the exiting blocks requires +; looking at transitive predecessors. + +; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable( + +; IR: exit0: ; preds = %Flow2 +; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef +; IR-NEXT: br label %UnifiedReturnBlock + + +; IR: indirect.exit1: +; IR: %load = load volatile i32, i32 addrspace(1)* undef +; IR: store volatile i32 %load, i32 addrspace(1)* undef +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %Flow2 + +; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2 +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: ret void +define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %indirect.exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +indirect.exit1: + %load = load volatile i32, i32 addrspace(1)* undef + store volatile i32 %load, i32 addrspace(1)* undef + br label %exit1 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; IR-LABEL: @multi_divergent_region_exit_ret_switch( +define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = 
getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + switch i32 %tmp16, label %exit1 + [ i32 1, label %LeafBlock + i32 2, label %LeafBlock1 + i32 3, label %exit0 ] + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle( +define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 { +entry: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret + +divergent.multi.exit.region: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1 + +divergent.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +divergent.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +uniform.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle( +define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 { +entry: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret + +divergent.multi.exit.region: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1 + +divergent.if: + %vgpr0 = load volatile float, float addrspace(1)* undef + %divergent.cond1 = fcmp ogt float %vgpr0, 1.0 + br i1 %divergent.cond1, label %divergent.then, label %divergent.endif + +divergent.then: + %vgpr1 = load volatile float, float addrspace(1)* undef + %divergent.cond2 = fcmp olt float %vgpr1, 4.0 + store volatile i32 33, i32 addrspace(1)* undef + br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif + +divergent.endif: + store volatile i32 38, i32 addrspace(1)* undef + br label %divergent.ret0 + +divergent.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +divergent.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +uniform.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle( +; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region +; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] +; IR: br i1 %8, label %uniform.if, label %Flow2 + +; IR: Flow: ; preds = %uniform.then, %uniform.if +; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ] +; IR: br i1 %11, label %uniform.endif, label %uniform.ret0 + +; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2 +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6) +; IR-NEXT: ret void +define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 { +entry: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + 
%divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret + +uniform.multi.exit.region: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1 + +uniform.if: + %sgpr0 = load volatile i32, i32 addrspace(2)* undef + %uniform.cond1 = icmp slt i32 %sgpr0, 1 + br i1 %uniform.cond1, label %uniform.then, label %uniform.endif + +uniform.then: + %sgpr1 = load volatile i32, i32 addrspace(2)* undef + %uniform.cond2 = icmp sge i32 %sgpr1, 4 + store volatile i32 33, i32 addrspace(1)* undef + br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif + +uniform.endif: + store volatile i32 38, i32 addrspace(1)* undef + br label %uniform.ret0 + +uniform.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +uniform.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +divergent.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @multi_divergent_unreachable_exit( +; IR: UnifiedUnreachableBlock: +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %UnifiedReturnBlock + +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 +; IR-NEXT: ret void +define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + switch i32 %tmp, label %bb3 [ + i32 2, label %bb1 + i32 0, label %bb2 + ] + +bb1: ; preds = %bb + unreachable + +bb2: ; preds = %bb + unreachable + +bb3: ; preds = %bb + switch i32 undef, label %bb5 [ + i32 2, label %bb4 + ] + +bb4: ; preds = %bb3 + ret void + +bb5: ; preds = %bb3 + unreachable +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll index 95c7ce862329..15de689b953e 100644 --- a/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/test/CodeGen/AMDGPU/multilevel-break.ll @@ -64,7 +64,7 @@ ENDIF: ; preds = %LOOP br i1 %tmp51, label %LOOP, label %LOOP.outer } -; OPT-LABEL: define void @multi_if_break_loop( +; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop( ; OPT: llvm.amdgcn.break ; OPT: llvm.amdgcn.loop ; OPT: llvm.amdgcn.if.break @@ -79,7 +79,7 @@ ENDIF: ; preds = %LOOP ; Uses a copy intsead of an or ; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]] ; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]] -define void @multi_if_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll new file mode 100644 index 000000000000..672549c8ea63 --- /dev/null +++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -0,0 +1,269 @@ +; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; After structurizing, there are 3 levels of loops. The i1 phi +; conditions mutually depend on each other, so it isn't safe to delete +; the condition that appears to have no uses until the loop is +; completely processed. 
+ + +; IR-LABEL: @reduced_nested_loop_conditions( + +; IR: bb5: +; IR-NEXT: %phi.broken = phi i64 [ %loop.phi, %bb10 ], [ 0, %bb ] +; IR-NEXT: %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ] +; IR-NEXT: %tmp7 = icmp eq i32 %tmp6, 1 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp7) +; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0 +; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1 +; IR-NEXT: br i1 %1, label %bb8, label %Flow + +; IR: bb8: +; IR-NEXT: %3 = call i64 @llvm.amdgcn.break(i64 %phi.broken) +; IR-NEXT: br label %bb13 + +; IR: bb10: +; IR-NEXT: %loop.phi = phi i64 [ %6, %Flow ] +; IR-NEXT: %tmp11 = phi i32 [ %5, %Flow ] +; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) +; IR-NEXT: br i1 %4, label %bb23, label %bb5 + +; IR: Flow: +; IR-NEXT: %loop.phi1 = phi i64 [ %loop.phi2, %bb4 ], [ %phi.broken, %bb5 ] +; IR-NEXT: %5 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ] +; IR-NEXT: %6 = call i64 @llvm.amdgcn.else.break(i64 %2, i64 %loop.phi1) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %2) +; IR-NEXT: br label %bb10 + +; IR: bb13: +; IR-NEXT: %loop.phi3 = phi i64 [ %loop.phi4, %bb3 ], [ %3, %bb8 ] +; IR-NEXT: %tmp14 = phi i1 [ false, %bb3 ], [ true, %bb8 ] +; IR-NEXT: %tmp15 = bitcast i64 %tmp2 to <2 x i32> +; IR-NEXT: br i1 %tmp14, label %bb16, label %bb20 + +; IR: bb16: +; IR-NEXT: %tmp17 = extractelement <2 x i32> %tmp15, i64 1 +; IR-NEXT: %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17 +; IR-NEXT: %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18 +; IR-NEXT: br label %bb20 + +; IR: bb20: +; IR-NEXT: %loop.phi4 = phi i64 [ %phi.broken, %bb16 ], [ %phi.broken, %bb13 ] +; IR-NEXT: %loop.phi2 = phi i64 [ %phi.broken, %bb16 ], [ %loop.phi3, %bb13 ] +; IR-NEXT: %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ] +; IR-NEXT: br label %bb9 + +; IR: bb23: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; IR-NEXT: ret void + +; GCN-LABEL: {{^}}reduced_nested_loop_conditions: + +; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cbranch_scc1 + +; FIXME: Should fold to unconditional branch? 
+; GCN: s_mov_b64 vcc, -1 +; GCN-NEXT: ; implicit-def +; GCN: s_cbranch_vccz + +; GCN: ds_read_b32 + +; GCN: [[BB9:BB[0-9]+_[0-9]+]]: ; %bb9 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_branch [[BB9]] +define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* nocapture %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %tmp + %tmp2 = load volatile i64, i64 addrspace(3)* %tmp1 + br label %bb5 + +bb3: ; preds = %bb9 + br i1 true, label %bb4, label %bb13 + +bb4: ; preds = %bb3 + br label %bb10 + +bb5: ; preds = %bb10, %bb + %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ] + %tmp7 = icmp eq i32 %tmp6, 1 + br i1 %tmp7, label %bb8, label %bb10 + +bb8: ; preds = %bb5 + br label %bb13 + +bb9: ; preds = %bb20, %bb9 + br i1 false, label %bb3, label %bb9 + +bb10: ; preds = %bb5, %bb4 + %tmp11 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ] + %tmp12 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ] + br i1 %tmp12, label %bb23, label %bb5 + +bb13: ; preds = %bb8, %bb3 + %tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ] + %tmp15 = bitcast i64 %tmp2 to <2 x i32> + br i1 %tmp14, label %bb16, label %bb20 + +bb16: ; preds = %bb13 + %tmp17 = extractelement <2 x i32> %tmp15, i64 1 + %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17 + %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18 + br label %bb20 + +bb20: ; preds = %bb16, %bb13 + %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ] + %tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ] + br label %bb9 + +bb23: ; preds = %bb10 + ret void +} + +; Earlier version of above, before a run of the structurizer. +; IR-LABEL: @nested_loop_conditions( + +; IR: Flow7: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17) +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15) +; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0 +; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1 +; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8 + +; IR: Flow1: +; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ] +; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ] +; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ] +; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) +; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17) +; IR-NEXT: br i1 %18, label %Flow7, label %bb14 + +; IR: Flow2: +; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ] +; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ] +; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ] +; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ] +; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23) +; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0 +; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1 +; IR-NEXT: br i1 %25, label %bb21, label %Flow3 + +; IR: bb21: +; IR: %tmp12 = icmp slt i32 %tmp11, 9 +; IR-NEXT: %27 = xor i1 %tmp12, true +; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken) +; IR-NEXT: br label %Flow3 + +; IR: Flow3: +; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ] +; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ] +; 
IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] +; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] +; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ] +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26) +; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4 + +; IR: bb31: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7) +; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef +; IR-NEXT: ret void + + +; GCN-LABEL: {{^}}nested_loop_conditions: + +; GCN: v_cmp_lt_i32_e32 vcc, 8, v +; GCN: s_and_b64 vcc, exec, vcc +; GCN: s_cbranch_vccnz [[BB31:BB[0-9]+_[0-9]+]] + +; GCN: [[BB14:BB[0-9]+_[0-9]+]]: ; %bb14 +; GCN: v_cmp_ne_u32_e32 vcc, 1, v +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz [[BB31]] + +; GCN: [[BB18:BB[0-9]+_[0-9]+]]: ; %bb18 +; GCN: buffer_load_dword +; GCN: v_cmp_lt_i32_e32 vcc, 8, v +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz [[BB18]] + +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cmp_gt_i32_e32 vcc, 9 +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccnz [[BB14]] + +; GCN: [[BB31]]: +; GCN: buffer_store_dword +; GCN: s_endpgm +define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp1 + %tmp3 = load i64, i64 addrspace(1)* %tmp2, align 16 + %tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16 + %tmp1033 = extractelement <4 x i32> %tmp932, i64 0 + %tmp1134 = load volatile i32, i32 addrspace(1)* undef + %tmp1235 = icmp slt i32 %tmp1134, 9 + br i1 %tmp1235, label %bb14.lr.ph, label %bb13 + +bb14.lr.ph: ; preds = %bb + br label %bb14 + +bb4.bb13_crit_edge: ; preds = %bb21 + br label %bb13 + +bb13: ; preds = %bb4.bb13_crit_edge, %bb + br label %bb31 + +bb14: ; preds = %bb21, %bb14.lr.ph + %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %tmp10, %bb21 ] + %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %tmp9, %bb21 ] + %tmp15 = icmp eq i32 %tmp1037, 1 + br i1 %tmp15, label %bb16, label %bb31.loopexit + +bb16: ; preds = %bb14 + %tmp17 = bitcast i64 %tmp3 to <2 x i32> + br label %bb18 + +bb18: ; preds = %bb18, %bb16 + %tmp19 = load volatile i32, i32 addrspace(1)* undef + %tmp20 = icmp slt i32 %tmp19, 9 + br i1 %tmp20, label %bb21, label %bb18 + +bb21: ; preds = %bb18 + %tmp22 = extractelement <2 x i32> %tmp17, i64 1 + %tmp23 = lshr i32 %tmp22, 16 + %tmp24 = select i1 undef, i32 undef, i32 %tmp23 + %tmp25 = uitofp i32 %tmp24 to float + %tmp26 = fmul float %tmp25, 0x3EF0001000000000 + %tmp27 = fsub float %tmp26, undef + %tmp28 = fcmp olt float %tmp27, 5.000000e-01 + %tmp29 = select i1 %tmp28, i64 1, i64 2 + %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29 + %tmp7 = zext i32 %tmp30 to i64 + %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7 + %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16 + %tmp10 = extractelement <4 x i32> %tmp9, i64 0 + %tmp11 = load volatile i32, i32 addrspace(1)* undef + %tmp12 = icmp slt i32 %tmp11, 9 + br i1 %tmp12, label %bb14, label %bb4.bb13_crit_edge + +bb31.loopexit: ; preds = %bb14 + br label %bb31 + +bb31: ; preds = %bb31.loopexit, %bb13 + store volatile i32 0, i32 addrspace(1)* undef + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll 
b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll index 9dd99efd997c..97dc67f82607 100644 --- a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll +++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll @@ -9,7 +9,7 @@ @extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 ; CHECK-DAG: Name: load_extern_const_init -define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { @undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 ; CHECK-DAG: Name: undef_const_addrspace -define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll index fd66b0b5d1f6..8a7bf6db5b8d 100644 --- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16: ; SI: s_load_dword s ; SI: buffer_store_short v -define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i16 store i16 %trunc, i16 addrspace(1)* %out ret void @@ -21,7 +21,7 @@ define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounw ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: ; SI: buffer_load_dword v ; SI: buffer_store_short v -define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid @@ -34,7 +34,7 @@ define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspa ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwin ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i32, i32 addrspace(1)* 
%in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -56,7 +56,7 @@ define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i1 store i1 %trunc, i1 addrspace(1)* %out ret void @@ -65,7 +65,7 @@ define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwin ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid @@ -78,7 +78,7 @@ define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { %trunc = trunc i64 %arg to i32 store i32 %trunc, i32 addrspace(1)* %out ret void @@ -87,7 +87,7 @@ define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounw ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -100,7 +100,7 @@ define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspa ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -110,7 +110,7 @@ define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -125,7 +125,7 @@ define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1) ; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) 
nounwind { +define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { %trunc = trunc i16 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -134,7 +134,7 @@ define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwin ; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: ; SI: buffer_load_ubyte v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -147,7 +147,7 @@ define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i8 store i8 %trunc, i8 addrspace(1)* %out @@ -157,7 +157,7 @@ define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -171,7 +171,7 @@ define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { %trunc = trunc i64 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -180,7 +180,7 @@ define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwin ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -194,7 +194,7 @@ define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0 ; SI: s_waitcnt lgkmcnt(0) ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff -define void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %val = load i32, i32 addrspace(2)* %in %mask = and i32 %val, 65535 @@ -205,7 +205,7 @@ entry: ; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void 
@extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %bc = bitcast <2 x i32> %ld to i64 %hi = lshr i64 %bc, 32 diff --git a/test/CodeGen/AMDGPU/nop-data.ll b/test/CodeGen/AMDGPU/nop-data.ll new file mode 100644 index 000000000000..b68f343097e5 --- /dev/null +++ b/test/CodeGen/AMDGPU/nop-data.ll @@ -0,0 +1,87 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s + +; CHECK: kernel0: +; CHECK-NEXT: s_endpgm +define amdgpu_kernel void @kernel0() align 256 { +entry: + ret void +} + +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 + +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 + +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 + +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 + +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 + +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 + +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 + +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 0 // 0000000001FC: BF800000 + +; CHECK-NEXT: {{^$}} +; CHECK-NEXT: kernel1: +; CHECK-NEXT: s_endpgm +define amdgpu_kernel void @kernel1(i32 addrspace(1)* addrspace(2)* %ptr.out) align 256 { +entry: + ret void +} diff --git a/test/CodeGen/AMDGPU/nullptr.ll b/test/CodeGen/AMDGPU/nullptr.ll new file mode 100644 index 000000000000..0df16da13562 --- /dev/null +++ b/test/CodeGen/AMDGPU/nullptr.ll @@ -0,0 +1,113 @@ +;RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s + +%struct.S = type { i32*, i32 addrspace(1)*, i32 addrspace(2)*, i32 addrspace(3)*, i32 addrspace(4)*, i32 addrspace(5)*} + +; CHECK-LABEL: nullptr_priv: +; CHECK-NEXT: .long 0 +@nullptr_priv = global i32* addrspacecast (i32 addrspace(4)* null to i32*) + +; CHECK-LABEL: nullptr_glob: +; CHECK-NEXT: .quad 0 +@nullptr_glob = global i32 addrspace(1)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(1)*) + +; CHECK-LABEL: nullptr_const: +; CHECK-NEXT: .quad 0 +@nullptr_const = global i32 addrspace(2)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(2)*) + +; CHECK-LABEL: nullptr_local: +; CHECK-NEXT: .long -1 +@nullptr_local = global i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) + +; CHECK-LABEL: nullptr_region: +; CHECK-NEXT: .long -1 +@nullptr_region = global i32 addrspace(5)* addrspacecast (i32 
addrspace(4)* null to i32 addrspace(5)*) + +; CHECK-LABEL: nullptr6: +; CHECK-NEXT: .long 0 +@nullptr6 = global i32 addrspace(6)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(6)*) + +; CHECK-LABEL: nullptr7: +; CHECK-NEXT: .long 0 +@nullptr7 = global i32 addrspace(7)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(7)*) + +; CHECK-LABEL: nullptr8: +; CHECK-NEXT: .long 0 +@nullptr8 = global i32 addrspace(8)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(8)*) + +; CHECK-LABEL: nullptr9: +; CHECK-NEXT: .long 0 +@nullptr9 = global i32 addrspace(9)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(9)*) + +; CHECK-LABEL: nullptr10: +; CHECK-NEXT: .long 0 +@nullptr10 = global i32 addrspace(10)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(10)*) + +; CHECK-LABEL: nullptr11: +; CHECK-NEXT: .long 0 +@nullptr11 = global i32 addrspace(11)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(11)*) + +; CHECK-LABEL: nullptr12: +; CHECK-NEXT: .long 0 +@nullptr12 = global i32 addrspace(12)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(12)*) + +; CHECK-LABEL: nullptr13: +; CHECK-NEXT: .long 0 +@nullptr13 = global i32 addrspace(13)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(13)*) + +; CHECK-LABEL: nullptr14: +; CHECK-NEXT: .long 0 +@nullptr14 = global i32 addrspace(14)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(14)*) + +; CHECK-LABEL: nullptr15: +; CHECK-NEXT: .long 0 +@nullptr15 = global i32 addrspace(15)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(15)*) + +; CHECK-LABEL: nullptr16: +; CHECK-NEXT: .long 0 +@nullptr16 = global i32 addrspace(16)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(16)*) + +; CHECK-LABEL: nullptr17: +; CHECK-NEXT: .long 0 +@nullptr17 = global i32 addrspace(17)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(17)*) + +; CHECK-LABEL: nullptr18: +; CHECK-NEXT: .long 0 +@nullptr18 = global i32 addrspace(18)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(18)*) + +; CHECK-LABEL: nullptr19: +; CHECK-NEXT: .long 0 +@nullptr19 = global i32 addrspace(19)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(19)*) + +; CHECK-LABEL: nullptr20: +; CHECK-NEXT: .long 0 +@nullptr20 = global i32 addrspace(20)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(20)*) + +; CHECK-LABEL: nullptr21: +; CHECK-NEXT: .long 0 +@nullptr21 = global i32 addrspace(21)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(21)*) + +; CHECK-LABEL: nullptr22: +; CHECK-NEXT: .long 0 +@nullptr22 = global i32 addrspace(22)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(22)*) + +; CHECK-LABEL: nullptr23: +; CHECK-NEXT: .long 0 +@nullptr23 = global i32 addrspace(23)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(23)*) + +; CHECK-LABEL: structWithPointers: +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .long -1 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .long -1 +; CHECK-NEXT: .zero 4 +@structWithPointers = addrspace(1) global %struct.S { + i32* addrspacecast (i32 addrspace(4)* null to i32*), + i32 addrspace(1)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(1)*), + i32 addrspace(2)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(2)*), + i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), + i32 addrspace(4)* null, + i32 addrspace(5)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(5)*)}, align 4 diff --git a/test/CodeGen/AMDGPU/omod.ll 
b/test/CodeGen/AMDGPU/omod.ll new file mode 100644 index 000000000000..3fd7b13fcc58 --- /dev/null +++ b/test/CodeGen/AMDGPU/omod.ll @@ -0,0 +1,297 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; IEEE bit enabled for compute kernel, no shouldn't use. +; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}} +define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 0.5 + store float %div2, float addrspace(1)* %out.gep + ret void +} + +; IEEE bit enabled for compute kernel, no shouldn't use even though nsz is allowed +; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}} +define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 0.5 + store float %div2, float addrspace(1)* %out.gep + ret void +} + +; Only allow without IEEE bit if signed zeros are significant. 
+; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros: +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}} +define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 { + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 0.5 + store float %div2, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_div2_f32: +; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}} +define amdgpu_ps void @v_omod_div2_f32(float %a) #0 { + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 0.5 + store float %div2, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_mul2_f32: +; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}} +define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 { + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 2.0 + store float %div2, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_mul4_f32: +; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}} +define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 { + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 4.0 + store float %div2, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32: +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}} +define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 { + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 4.0 + store float %div2, float addrspace(1)* undef + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_mul4_dbg_use_f32: +; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}} +define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 { + %add = fadd float %a, 1.0 + call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10 + %div2 = fmul float %add, 4.0 + store float %div2, float addrspace(1)* undef + ret void +} + +; Clamp is applied after omod, folding both into instruction is OK. 
+; GCN-LABEL: {{^}}v_clamp_omod_div2_f32: +; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 clamp div:2{{$}} +define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 { + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 0.5 + + %max = call float @llvm.maxnum.f32(float %div2, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + store float %clamp, float addrspace(1)* undef + ret void +} + +; Cannot fold omod into clamp +; GCN-LABEL: {{^}}v_omod_div2_clamp_f32: +; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 clamp{{$}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}} +define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 { + %add = fadd float %a, 1.0 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + %div2 = fmul float %clamp, 0.5 + store float %div2, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_div2_abs_src_f32: +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ADD]]|, 0.5{{$}} +define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 { + %add = fadd float %a, 1.0 + %abs.add = call float @llvm.fabs.f32(float %add) + %div2 = fmul float %abs.add, 0.5 + store float %div2, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_add_self_clamp_f32: +; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, v0 clamp{{$}} +define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 { + %add = fadd float %a, %a + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + store float %clamp, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_add_clamp_self_f32: +; GCN: v_max_f32_e64 [[CLAMP:v[0-9]+]], v0, v0 clamp{{$}} +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[CLAMP]], [[CLAMP]]{{$}} +define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 { + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + %add = fadd float %clamp, %clamp + store float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_add_abs_self_f32: +; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0 +; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, |[[X]]|{{$}} +define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 { + %x = fadd float %a, 1.0 + %abs.x = call float @llvm.fabs.f32(float %x) + %add = fadd float %abs.x, %abs.x + store float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_add_abs_x_x_f32: + +; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0 +; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[X]]{{$}} +define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 { + %x = fadd float %a, 1.0 + %abs.x = call float @llvm.fabs.f32(float %x) + %add = fadd float %abs.x, %x + store float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_add_x_abs_x_f32: +; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0 +; GCN: v_add_f32_e64 v{{[0-9]+}}, [[X]], |[[X]]|{{$}} +define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 { + %x = fadd float %a, 1.0 + %abs.x = call float @llvm.fabs.f32(float %x) + %add = fadd float %x, %abs.x + store float %add, float addrspace(1)* undef + ret void +} + +; Don't fold omod into omod into another omod. 
+; GCN-LABEL: {{^}}v_omod_div2_omod_div2_f32: +; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}} +define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 { + %add = fadd float %a, 1.0 + %div2.0 = fmul float %add, 0.5 + %div2.1 = fmul float %div2.0, 0.5 + store float %div2.1, float addrspace(1)* undef + ret void +} + +; Don't fold omod if denorms enabled +; GCN-LABEL: {{^}}v_omod_div2_f32_denormals: +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}} +define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 { + %add = fadd float %a, 1.0 + %div2 = fmul float %add, 0.5 + store float %div2, float addrspace(1)* undef + ret void +} + +; Don't fold omod if denorms enabled for add form. +; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals: +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}} +define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 { + %add = fadd float %a, 1.0 + %mul2 = fadd float %add, %add + store float %mul2, float addrspace(1)* undef + ret void +} + +; Don't fold omod if denorms enabled +; GCN-LABEL: {{^}}v_omod_div2_f16_denormals: +; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} +; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}} +define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 { + %add = fadd half %a, 1.0 + %div2 = fmul half %add, 0.5 + store half %div2, half addrspace(1)* undef + ret void +} + +; Don't fold omod if denorms enabled for add form. +; GCN-LABEL: {{^}}v_omod_mul2_f16_denormals: +; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}} +; VI: v_add_f16_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}} +define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 { + %add = fadd half %a, 1.0 + %mul2 = fadd half %add, %add + store half %mul2, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_div2_f16_no_denormals: +; VI-NOT: v0 +; VI: v_add_f16_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}} +define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 { + %add = fadd half %a, 1.0 + %div2 = fmul half %add, 0.5 + store half %div2, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_omod_mac_to_mad: +; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} mul:2{{$}} +define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 { + %mul = fmul float %a, %a + %add = fadd float %mul, %b + %mad = fmul float %add, 2.0 + %res = fmul float %mad, %b + store float %res, float addrspace(1)* undef + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.floor.f32(float) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 +declare double @llvm.fabs.f64(double) #1 +declare double @llvm.minnum.f64(double, double) #1 +declare double @llvm.maxnum.f64(double, double) #1 +declare half @llvm.fabs.f16(half) #1 +declare half @llvm.minnum.f16(half, half) #1 +declare half @llvm.maxnum.f16(half, half) #1 +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "target-features"="+fp32-denormals" "no-signed-zeros-fp-math"="true" } +attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" "no-signed-zeros-fp-math"="true" } +attributes #4 = { nounwind 
"no-signed-zeros-fp-math"="false" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1) +!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8} +!8 = !DIBasicType(name: "float", size: 32, align: 32) +!9 = !DIExpression() +!10 = !DILocation(line: 1, column: 42, scope: !5) diff --git a/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/test/CodeGen/AMDGPU/opencl-image-metadata.ll index 0242f6d6145a..c974471c6573 100644 --- a/test/CodeGen/AMDGPU/opencl-image-metadata.ll +++ b/test/CodeGen/AMDGPU/opencl-image-metadata.ll @@ -6,7 +6,7 @@ ; EG: CF_END ; SI: s_endpgm -define void @kernel(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll index 4e5ea4b86b77..3836a2b7e599 100644 --- a/test/CodeGen/AMDGPU/operand-folding.ll +++ b/test/CodeGen/AMDGPU/operand-folding.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: {{^}}fold_sgpr: ; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s -define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { +define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { entry: %tmp0 = icmp ne i32 %fold, 0 br i1 %tmp0, label %if, label %endif @@ -20,7 +20,7 @@ endif: ; CHECK-LABEL: {{^}}fold_imm: ; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 -define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { +define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { entry: %fold = add i32 3, 2 %tmp0 = icmp ne i32 %cmp, 0 @@ -46,7 +46,7 @@ endif: ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] ; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, -define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { +define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { entry: %tmp0 = add i64 %val, 1 store i64 %tmp0, i64 addrspace(1)* %out @@ -61,7 +61,7 @@ entry: ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -define void @vector_inline(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp0, 1 @@ -80,7 +80,7 @@ entry: ; CHECK-LABEL: {{^}}imm_one_use: ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} -define void @imm_one_use(i32 addrspace(1)* %out) { +define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = xor i32 %tmp0, 100 @@ -94,7 +94,7 @@ entry: ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -define void @vector_imm(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp0, 1 @@ -114,7 +114,7 @@ entry: ; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; CHECK: 
v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]] ; CHECK: buffer_store_dword v[[LO]] -define void @no_fold_tied_subregister() { +define amdgpu_kernel void @no_fold_tied_subregister() { %tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef %tmp2 = extractelement <2 x float> %tmp1, i32 0 %tmp3 = extractelement <2 x float> %tmp1, i32 1 diff --git a/test/CodeGen/AMDGPU/operand-spacing.ll b/test/CodeGen/AMDGPU/operand-spacing.ll index 127f3da220e7..fc6f070b737a 100644 --- a/test/CodeGen/AMDGPU/operand-spacing.ll +++ b/test/CodeGen/AMDGPU/operand-spacing.ll @@ -11,7 +11,7 @@ ; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] ; GCN: buffer_store_dword [[RESULT]], -define void @add_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) { %result = fadd float %a, %b store float %result, float addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir index 4584802ad5a7..2de6b59e59e9 100644 --- a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir +++ b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir @@ -3,7 +3,7 @@ --- | target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - define void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 { main_body: %id = call i32 @llvm.amdgcn.workitem.id.x() %cc = icmp eq i32 %id, 0 @@ -23,7 +23,7 @@ ret void } - define void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -34,7 +34,7 @@ ret void } - define void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -46,7 +46,7 @@ } - define void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 { main_body: %id = call i32 @llvm.amdgcn.workitem.id.x() %cc = icmp eq i32 %id, 0 @@ -67,7 +67,7 @@ ret void } - define void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -78,7 +78,7 @@ ret void } - define void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -89,7 +89,7 @@ ret void } - define void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -100,7 +100,7 @@ ret void } - define void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -111,7 +111,7 @@ ret void } - define void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -122,7 +122,7 @@ 
ret void } - define void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll index eca6909d4eb9..eb082843fb82 100644 --- a/test/CodeGen/AMDGPU/or.ll +++ b/test/CodeGen/AMDGPU/or.ll @@ -9,7 +9,7 @@ ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -28,7 +28,7 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -39,7 +39,7 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) ; FUNC-LABEL: {{^}}scalar_or_i32: ; SI: s_or_b32 -define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %or = or i32 %a, %b store i32 %or, i32 addrspace(1)* %out ret void @@ -47,7 +47,7 @@ define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; FUNC-LABEL: {{^}}vector_or_i32: ; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { +define amdgpu_kernel void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { %loada = load i32, i32 addrspace(1)* %a %or = or i32 %loada, %b store i32 %or, i32 addrspace(1)* %out @@ -56,7 +56,7 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) ; FUNC-LABEL: {{^}}scalar_or_literal_i32: ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f -define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { %or = or i32 %a, 99999 store i32 %or, i32 addrspace(1)* %out, align 4 ret void @@ -68,7 +68,7 @@ define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void @@ -82,7 +82,7 @@ define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = or i64 
%a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -101,7 +101,7 @@ define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: or_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void @@ -111,7 +111,7 @@ define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { ; SI-NOT: or_b32 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63 ; SI-NOT: or_b32 -define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out %foo = add i64 %b, 63 @@ -125,7 +125,7 @@ define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void @@ -133,7 +133,7 @@ define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { ; FUNC-LABEL: {{^}}vector_or_literal_i32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { %loada = load i32, i32 addrspace(1)* %a, align 4 %or = or i32 %loada, 65535 store i32 %or, i32 addrspace(1)* %out, align 4 @@ -142,7 +142,7 @@ define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, ; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} -define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { %loada = load i32, i32 addrspace(1)* %a, align 4 %or = or i32 %loada, 4 store i32 %or, i32 addrspace(1)* %out, align 4 @@ -154,7 +154,7 @@ define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspac ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z ; SI: s_or_b64 -define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = or i64 %a, %b store i64 %or, i64 addrspace(1)* %out ret void @@ -163,7 +163,7 @@ define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; FUNC-LABEL: {{^}}vector_or_i64: ; SI: v_or_b32_e32 v{{[0-9]}} ; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %loadb = load i64, i64 addrspace(1)* %b, align 8 %or = or i64 %loada, %loadb @@ -174,7 +174,7 @@ define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* 
%a, i64 add ; FUNC-LABEL: {{^}}scalar_vector_or_i64: ; SI: v_or_b32_e32 v{{[0-9]}} ; SI: v_or_b32_e32 v{{[0-9]}} -define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { +define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { %loada = load i64, i64 addrspace(1)* %a %or = or i64 %loada, %b store i64 %or, i64 addrspace(1)* %out @@ -186,7 +186,7 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] ; SI: s_endpgm -define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, 22470723082367 store i64 %or, i64 addrspace(1)* %out @@ -200,7 +200,7 @@ define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, ; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0 ; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}} ; SI: s_endpgm -define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, 8 store i64 %or, i64 addrspace(1)* %out @@ -213,7 +213,7 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}} ; SI: s_endpgm -define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, -8 store i64 %or, i64 addrspace(1)* %out @@ -226,7 +226,7 @@ define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace( ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]] ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, -200 store i64 %or, i64 addrspace(1)* %out @@ -239,7 +239,7 @@ define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %add = or i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 @@ -250,7 +250,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { +define amdgpu_kernel void @or_i1(i32 
addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float, float addrspace(1)* %in0 %b = load float, float addrspace(1)* %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -263,7 +263,7 @@ define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrs ; FUNC-LABEL: {{^}}s_or_i1: ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d %or = or i1 %cmp0, %cmp1 diff --git a/test/CodeGen/AMDGPU/over-max-lds-size.ll b/test/CodeGen/AMDGPU/over-max-lds-size.ll index 32ad9aba04ed..57777e783c56 100644 --- a/test/CodeGen/AMDGPU/over-max-lds-size.ll +++ b/test/CodeGen/AMDGPU/over-max-lds-size.ll @@ -6,7 +6,7 @@ @huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4 -define void @use_huge_lds() { +define amdgpu_kernel void @use_huge_lds() { entry: %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0 store i32 0, i32 addrspace(3)* %v0 diff --git a/test/CodeGen/AMDGPU/pack.v2f16.ll b/test/CodeGen/AMDGPU/pack.v2f16.ll new file mode 100644 index 000000000000..5a07f7ca6ae8 --- /dev/null +++ b/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -0,0 +1,219 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + + +; GCN-LABEL: {{^}}s_pack_v2f16: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { + %val0 = load volatile i32, i32 addrspace(2)* %in0 + %val1 = load volatile i32, i32 addrspace(2)* %in1 + %lo.i = trunc i32 %val0 to i16 + %hi.i = trunc i32 %val1 to i16 + %lo = bitcast i16 %lo.i to half + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}s_pack_v2f16_imm_lo: +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]] +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { + %val1 = load i32, i32 addrspace(2)* %in1 + %hi.i = trunc i32 %val1 to i16 + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}s_pack_v2f16_imm_hi: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234 +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 { + %val0 = load i32, i32 addrspace(2)* %in0 + %lo.i = trunc i32 
%val0 to i16 + %lo = bitcast i16 %lo.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] + +; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %lo.i = trunc i32 %val0 to i16 + %hi.i = trunc i32 %val1 to i16 + %lo = bitcast i16 %lo.i to half + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_user: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] + +; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] + +; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] +define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %lo.i = trunc i32 %val0 to i16 + %hi.i = trunc i32 %val1 to i16 + %lo = bitcast i16 %lo.i to half + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + %foo = add i32 %vec.i32, 9 + store volatile i32 %foo, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_imm_lo: +; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]] + +; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}} +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %hi.i = trunc i32 %val1 to i16 + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_lo: +; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]] + +; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4400{{$}} +; GFX9: v_lshl_or_b32 
[[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] + +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %hi.i = trunc i32 %val1 to i16 + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half 4.0, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_imm_hi: +; GFX9-DAG: flat_load_dword [[VAL0:v[0-9]+]] + +; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234 +; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]] + +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo.i = trunc i32 %val0 to i16 + %lo = bitcast i16 %lo.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_inline_f16imm_hi: +; GFX9-DAG: flat_load_dword [[VAL:v[0-9]+]] + +; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3c00 +; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]] +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]] + +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo.i = trunc i32 %val0 to i16 + %lo = bitcast i16 %lo.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half 1.0, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_hi: +; GFX9: flat_load_dword [[VAL:v[0-9]+]] + +; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]] +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]] + +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo.i = trunc i32 %val0 to i16 + %lo = bitcast i16 %lo.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half 0xH0040, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/pack.v2i16.ll b/test/CodeGen/AMDGPU/pack.v2i16.ll new file mode 100644 index 
000000000000..8515fbc6dbae --- /dev/null +++ b/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -0,0 +1,181 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + + +; GCN-LABEL: {{^}}s_pack_v2i16: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { + %val0 = load volatile i32, i32 addrspace(2)* %in0 + %val1 = load volatile i32, i32 addrspace(2)* %in1 + %lo = trunc i32 %val0 to i16 + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}s_pack_v2i16_imm_lo: +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]] +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { + %val1 = load i32, i32 addrspace(2)* %in1 + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}s_pack_v2i16_imm_hi: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8 +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 { + %val0 = load i32, i32 addrspace(2)* %in0 + %lo = trunc i32 %val0 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] + +; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %lo = trunc i32 %val0 to i16 + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_user: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] + +; GFX9: v_and_b32_e32 
[[MASKED:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] + +; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] +define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %lo = trunc i32 %val0 to i16 + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + %foo = add i32 %vec.i32, 9 + store volatile i32 %foo, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_imm_lo: +; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}} + +; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b{{$}} +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] + +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 123, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_lo: +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] + +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, 64 +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 64, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_imm_hi: +; GFX9-DAG: flat_load_dword [[VAL0:v[0-9]+]] + +; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}} +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[VAL0]] + +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo = trunc i32 %val0 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 123, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_hi: +; GFX9: flat_load_dword [[VAL:v[0-9]+]] +; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], 7, 16, [[VAL0]] +; GFX9: ; use [[PACKED]] +define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) 
#0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo = trunc i32 %val0 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 7, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/packetizer.ll b/test/CodeGen/AMDGPU/packetizer.ll index 49a7c0df748f..1764d64c367f 100644 --- a/test/CodeGen/AMDGPU/packetizer.ll +++ b/test/CodeGen/AMDGPU/packetizer.ll @@ -7,7 +7,7 @@ ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z ; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W -define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { entry: %shl = sub i32 32, %e %x = add i32 %x_arg, 1 diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll index ea943a533c81..a90f200f79e3 100644 --- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -11,7 +11,7 @@ ; to do its transfomation, however now that we are using local memory for ; allocas, the transformation isn't happening. -define void @_Z9chk1D_512v() #0 { +define amdgpu_kernel void @_Z9chk1D_512v() #0 { entry: %a0 = alloca i32, align 4 %b0 = alloca i32, align 4 diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll index 1da1e91b8ab8..91116b0f65ea 100644 --- a/test/CodeGen/AMDGPU/parallelorifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -12,7 +12,7 @@ ; CHECK: OR_INT ; CHECK-NEXT: OR_INT ; CHECK-NEXT: OR_INT -define void @_Z9chk1D_512v() #0 { +define amdgpu_kernel void @_Z9chk1D_512v() #0 { entry: %a0 = alloca i32, align 4 %b0 = alloca i32, align 4 diff --git a/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll new file mode 100644 index 000000000000..77d793201adc --- /dev/null +++ b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -0,0 +1,638 @@ +; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR -check-prefix=GCN %s + +; FIXME: we should disable the sdwa peephole, because dead-code elimination, which +; runs after the peephole, ruins this test (different register numbers) + +; Spill all SGPRs so multiple VGPRs are required for spilling all of them. + +; Ideally we only need 2 VGPRs for all spilling. The VGPRs are +; allocated per-frame index, so it's possible to end up with more.
+ +; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs: + +; GCN: def s[8:15] +; GCN: def s[16:23] +; GCN: def s[24:31] +; GCN: def s[32:39] +; GCN: def s[40:47] +; GCN: def s[48:55] +; GCN: def s[56:63] +; GCN: def s[64:71] +; GCN: def s[72:79] +; GCN: def s[80:87] +; GCN: def s[88:95] + +; GCN: v_writelane_b32 v0, s8, 0 +; GCN-NEXT: v_writelane_b32 v0, s9, 1 +; GCN-NEXT: v_writelane_b32 v0, s10, 2 +; GCN-NEXT: v_writelane_b32 v0, s11, 3 +; GCN-NEXT: v_writelane_b32 v0, s12, 4 +; GCN-NEXT: v_writelane_b32 v0, s13, 5 +; GCN-NEXT: v_writelane_b32 v0, s14, 6 +; GCN-NEXT: v_writelane_b32 v0, s15, 7 + +; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}} +; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 9 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12 +; GCN-NEXT: v_writelane_b32 v0, s13, 13 +; GCN-NEXT: v_writelane_b32 v0, s14, 14 +; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15 + +; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} +; GCN: v_writelane_b32 v0, s[[TMP_LO]], 16 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 17 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20 +; GCN-NEXT: v_writelane_b32 v0, s13, 21 +; GCN-NEXT: v_writelane_b32 v0, s14, 22 +; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23 + +; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} +; GCN: v_writelane_b32 v0, s[[TMP_LO]], 24 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 25 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28 +; GCN-NEXT: v_writelane_b32 v0, s13, 29 +; GCN-NEXT: v_writelane_b32 v0, s14, 30 +; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31 + +; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} +; GCN: v_writelane_b32 v0, s[[TMP_LO]], 32 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 33 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36 +; GCN-NEXT: v_writelane_b32 v0, s13, 37 +; GCN-NEXT: v_writelane_b32 v0, s14, 38 +; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39 + +; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} +; GCN: v_writelane_b32 v0, s[[TMP_LO]], 40 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 41 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44 +; GCN-NEXT: v_writelane_b32 v0, s13, 45 +; GCN-NEXT: v_writelane_b32 v0, s14, 46 +; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47 + +; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} +; GCN: v_writelane_b32 v0, s[[TMP_LO]], 48 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 49 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52 +; GCN-NEXT: v_writelane_b32 v0, s13, 53 +; GCN-NEXT: v_writelane_b32 v0, s14, 54 +; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55 + +; GCN-NEXT: v_writelane_b32 v0, s88, 56 +; GCN-NEXT: v_writelane_b32 v0, s89, 57 +; GCN-NEXT: v_writelane_b32 v0, s90, 58 +; GCN-NEXT: v_writelane_b32 v0, s91, 59 +; GCN-NEXT: v_writelane_b32 v0, s92, 60 +; GCN-NEXT: v_writelane_b32 v0, s93, 61 +; GCN-NEXT: v_writelane_b32 v0, s94, 62 +; GCN-NEXT: v_writelane_b32 v0, s95, 63 +; GCN-NEXT: v_writelane_b32 v1, s16, 0 +; GCN-NEXT: v_writelane_b32 v1, s17, 1 +; GCN-NEXT: 
v_writelane_b32 v1, s18, 2 +; GCN-NEXT: v_writelane_b32 v1, s19, 3 +; GCN-NEXT: v_writelane_b32 v1, s20, 4 +; GCN-NEXT: v_writelane_b32 v1, s21, 5 +; GCN-NEXT: v_writelane_b32 v1, s22, 6 +; GCN-NEXT: v_writelane_b32 v1, s23, 7 +; GCN-NEXT: v_writelane_b32 v1, s24, 8 +; GCN-NEXT: v_writelane_b32 v1, s25, 9 +; GCN-NEXT: v_writelane_b32 v1, s26, 10 +; GCN-NEXT: v_writelane_b32 v1, s27, 11 +; GCN-NEXT: v_writelane_b32 v1, s28, 12 +; GCN-NEXT: v_writelane_b32 v1, s29, 13 +; GCN-NEXT: v_writelane_b32 v1, s30, 14 +; GCN-NEXT: v_writelane_b32 v1, s31, 15 +; GCN-NEXT: v_writelane_b32 v1, s32, 16 +; GCN-NEXT: v_writelane_b32 v1, s33, 17 +; GCN-NEXT: v_writelane_b32 v1, s34, 18 +; GCN-NEXT: v_writelane_b32 v1, s35, 19 +; GCN-NEXT: v_writelane_b32 v1, s36, 20 +; GCN-NEXT: v_writelane_b32 v1, s37, 21 +; GCN-NEXT: v_writelane_b32 v1, s38, 22 +; GCN-NEXT: v_writelane_b32 v1, s39, 23 +; GCN-NEXT: v_writelane_b32 v1, s40, 24 +; GCN-NEXT: v_writelane_b32 v1, s41, 25 +; GCN-NEXT: v_writelane_b32 v1, s42, 26 +; GCN-NEXT: v_writelane_b32 v1, s43, 27 +; GCN-NEXT: v_writelane_b32 v1, s44, 28 +; GCN-NEXT: v_writelane_b32 v1, s45, 29 +; GCN-NEXT: v_writelane_b32 v1, s46, 30 +; GCN-NEXT: v_writelane_b32 v1, s47, 31 +; GCN-NEXT: v_writelane_b32 v1, s48, 32 +; GCN-NEXT: v_writelane_b32 v1, s49, 33 +; GCN-NEXT: v_writelane_b32 v1, s50, 34 +; GCN-NEXT: v_writelane_b32 v1, s51, 35 +; GCN-NEXT: v_writelane_b32 v1, s52, 36 +; GCN-NEXT: v_writelane_b32 v1, s53, 37 +; GCN-NEXT: v_writelane_b32 v1, s54, 38 +; GCN-NEXT: v_writelane_b32 v1, s55, 39 +; GCN-NEXT: v_writelane_b32 v1, s56, 40 +; GCN-NEXT: v_writelane_b32 v1, s57, 41 +; GCN-NEXT: v_writelane_b32 v1, s58, 42 +; GCN-NEXT: v_writelane_b32 v1, s59, 43 +; GCN-NEXT: v_writelane_b32 v1, s60, 44 +; GCN-NEXT: v_writelane_b32 v1, s61, 45 +; GCN-NEXT: v_writelane_b32 v1, s62, 46 +; GCN-NEXT: v_writelane_b32 v1, s63, 47 +; GCN-NEXT: v_writelane_b32 v1, s64, 48 +; GCN-NEXT: v_writelane_b32 v1, s65, 49 +; GCN-NEXT: v_writelane_b32 v1, s66, 50 +; GCN-NEXT: v_writelane_b32 v1, s67, 51 +; GCN-NEXT: v_writelane_b32 v1, s68, 52 +; GCN-NEXT: v_writelane_b32 v1, s69, 53 +; GCN-NEXT: v_writelane_b32 v1, s70, 54 +; GCN-NEXT: v_writelane_b32 v1, s71, 55 +; GCN-NEXT: v_writelane_b32 v1, s72, 56 +; GCN-NEXT: v_writelane_b32 v1, s73, 57 +; GCN-NEXT: v_writelane_b32 v1, s74, 58 +; GCN-NEXT: v_writelane_b32 v1, s75, 59 +; GCN-NEXT: v_writelane_b32 v1, s76, 60 +; GCN-NEXT: v_writelane_b32 v1, s77, 61 +; GCN-NEXT: v_writelane_b32 v1, s78, 62 +; GCN-NEXT: v_writelane_b32 v1, s79, 63 +; GCN-NEXT: v_writelane_b32 v2, s80, 0 +; GCN-NEXT: v_writelane_b32 v2, s81, 1 +; GCN-NEXT: v_writelane_b32 v2, s82, 2 +; GCN-NEXT: v_writelane_b32 v2, s83, 3 +; GCN-NEXT: v_writelane_b32 v2, s84, 4 +; GCN-NEXT: v_writelane_b32 v2, s85, 5 +; GCN-NEXT: v_writelane_b32 v2, s86, 6 +; GCN-NEXT: v_writelane_b32 v2, s87, 7 +; GCN: s_cbranch_scc1 + + +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 0 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 1 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 2 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 3 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 4 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 5 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 6 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 7 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + + +; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 0 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 1 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 2 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 3 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 
4 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 5 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 6 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 7 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 8 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 9 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 10 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 11 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 12 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 13 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 14 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 15 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 16 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 17 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 18 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 19 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 20 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 21 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 22 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 23 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 24 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 25 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 26 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 27 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 28 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 29 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 30 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 31 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 32 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 33 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 34 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 35 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 36 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 37 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 38 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 39 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 40 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 41 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 42 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 43 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 44 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 45 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 46 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 47 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 48 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 49 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 50 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 51 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 52 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 53 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 54 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 55 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 56 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 57 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 58 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 59 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 60 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 61 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 62 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 63 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s{{[0-9]+}}, v2, 0 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 1 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 2 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 3 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 
4 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 5 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 6 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 7 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 56 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 59 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 60 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 61 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 62 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 63 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 8 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 9 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 10 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 11 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 12 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 13 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 14 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 15 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 16 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 17 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 18 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 19 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 20 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 21 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 22 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 23 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 24 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 25 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 26 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 27 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 28 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 29 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 30 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 31 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 32 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 33 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 34 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 35 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 36 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 37 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 38 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 39 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 40 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 41 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 42 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 43 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 44 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 45 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 46 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 47 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 48 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 55 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} +define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 { + %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <8 x i32> asm sideeffect "; def 
$0", "=s" () #0 + %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr5 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr6 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr7 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr8 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr9 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr10 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr11 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr12 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr13 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr14 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr15 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr16 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr4) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr5) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr6) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr7) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr8) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr9) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr10) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr11) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr12) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr13) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr14) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr15) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr16) #0 + br label %ret + +ret: + ret void +} + +; Some of the lanes of an SGPR spill are in one VGPR and some forced +; into the next available VGPR. 
+ +; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs: +; GCN: def s[24:39] + +; GCN: v_writelane_b32 v0, s24, 50 +; GCN-NEXT: v_writelane_b32 v0, s25, 51 +; GCN-NEXT: v_writelane_b32 v0, s26, 52 +; GCN-NEXT: v_writelane_b32 v0, s27, 53 +; GCN-NEXT: v_writelane_b32 v0, s28, 54 +; GCN-NEXT: v_writelane_b32 v0, s29, 55 +; GCN-NEXT: v_writelane_b32 v0, s30, 56 +; GCN-NEXT: v_writelane_b32 v0, s31, 57 +; GCN-NEXT: v_writelane_b32 v0, s32, 58 +; GCN-NEXT: v_writelane_b32 v0, s33, 59 +; GCN-NEXT: v_writelane_b32 v0, s34, 60 +; GCN-NEXT: v_writelane_b32 v0, s35, 61 +; GCN-NEXT: v_writelane_b32 v0, s36, 62 +; GCN-NEXT: v_writelane_b32 v0, s37, 63 +; GCN-NEXT: v_writelane_b32 v1, s38, 0 +; GCN-NEXT: v_writelane_b32 v1, s39, 1 + +; GCN: v_readlane_b32 s4, v0, 50 +; GCN-NEXT: v_readlane_b32 s5, v0, 51 +; GCN-NEXT: v_readlane_b32 s6, v0, 52 +; GCN-NEXT: v_readlane_b32 s7, v0, 53 +; GCN-NEXT: v_readlane_b32 s8, v0, 54 +; GCN-NEXT: v_readlane_b32 s9, v0, 55 +; GCN-NEXT: v_readlane_b32 s10, v0, 56 +; GCN-NEXT: v_readlane_b32 s11, v0, 57 +; GCN-NEXT: v_readlane_b32 s12, v0, 58 +; GCN-NEXT: v_readlane_b32 s13, v0, 59 +; GCN-NEXT: v_readlane_b32 s14, v0, 60 +; GCN-NEXT: v_readlane_b32 s15, v0, 61 +; GCN-NEXT: v_readlane_b32 s16, v0, 62 +; GCN-NEXT: v_readlane_b32 s17, v0, 63 +; GCN-NEXT: v_readlane_b32 s18, v1, 0 +; GCN-NEXT: v_readlane_b32 s19, v1, 1 +define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 { + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0 + br label %ret + +ret: + ret void +} + +; The first 64 SGPR spills can go to a VGPR, but there isn't a second +; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. 
+ +; GCN-LABEL: {{^}}no_vgprs_last_sgpr_spill: + +; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 0 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 1 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 2 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 3 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 4 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 5 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 6 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 7 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 8 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 9 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 10 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 11 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 12 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 13 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 14 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 15 + +; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 16 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 17 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 18 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 19 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 20 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 21 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 22 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 23 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 24 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 25 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 26 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 27 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 28 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 29 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 30 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 31 + +; GCN: def s[0:1] +; GCN: v_writelane_b32 v23, s0, 32 +; GCN-NEXT: v_writelane_b32 v23, s1, 33 + +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 34 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 35 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 36 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 37 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 38 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 39 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 40 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 41 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 42 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 43 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 44 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 45 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 46 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 47 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 48 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 49 + +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: 
buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: s_cbranch_scc1 + + +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 0 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 1 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 2 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 3 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 4 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 5 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 6 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 7 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 8 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 9 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 10 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 11 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 12 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 13 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 14 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 15 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + + +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 34 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 35 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 36 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 37 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 38 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 39 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 40 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 41 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 42 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 43 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 44 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 45 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 46 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 47 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 48 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 49 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 17 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 18 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 19 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 20 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 21 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 22 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 23 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 24 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 25 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 26 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 27 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 28 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 29 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 30 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} 
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} + +; GCN: v_readlane_b32 s0, v23, 32 +; GCN: v_readlane_b32 s1, v23, 33 +; GCN: ;;#ASMSTART +; GCN: ; use s[0:1] +define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { + call void asm sideeffect "", "~{VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}" () #0 + call void asm sideeffect "", "~{VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}" () #0 + call void asm sideeffect "", "~{VGPR16_VGPR17_VGPR18_VGPR19}"() #0 + call void asm sideeffect "", "~{VGPR20_VGPR21}"() #0 + call void asm sideeffect "", "~{VGPR22}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } diff --git a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll index 3e0d36978ad4..4bcfe5f3d28c 100644 --- a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll +++ b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 -define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %in.gep diff --git a/test/CodeGen/AMDGPU/predicates.ll b/test/CodeGen/AMDGPU/predicates.ll index c1af815c7b1e..566b48eb8864 100644 --- a/test/CodeGen/AMDGPU/predicates.ll +++ b/test/CodeGen/AMDGPU/predicates.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: {{^}}simple_if: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @simple_if(i32 addrspace(1)* %out, i32 %in) { entry: %cmp0 = icmp sgt i32 %in, 0 br i1 %cmp0, label %IF, label %ENDIF @@ -25,7 +25,7 @@ ENDIF: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 
Pred_sel ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF, label %ELSE @@ -51,7 +51,7 @@ ENDIF: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @nested_if(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF0, label %ENDIF @@ -79,7 +79,7 @@ ENDIF: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF0, label %ENDIF diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll index 2894730eccb1..af2683510293 100644 --- a/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -18,7 +18,7 @@ ; OPTNONE-NOT: s_mov_b32 ; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} -define void @store_to_undef() #0 { +define amdgpu_kernel void @store_to_undef() #0 { store volatile i32 0, i32* undef ret void } @@ -28,7 +28,7 @@ define void @store_to_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @store_to_inttoptr() #0 { +define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32* inttoptr (i32 123 to i32*) ret void } @@ -38,7 +38,7 @@ define void @store_to_inttoptr() #0 { ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @load_from_undef() #0 { +define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32* undef ret void } @@ -48,7 +48,7 @@ define void @load_from_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @load_from_inttoptr() #0 { +define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32* inttoptr (i32 123 to i32*) ret void } diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll index de9a8f755122..f80543079701 100644 --- a/test/CodeGen/AMDGPU/private-element-size.ll +++ b/test/CodeGen/AMDGPU/private-element-size.ll @@ -10,33 +10,33 @@ ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 
offen{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24{{$}} ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} -define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} +define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -59,36 +59,28 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 
s[0:3], s9 offen{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:56 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:88 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:72 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} @@ -97,6 +89,14 @@ entry: ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:52{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:56{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:60{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:64{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:68{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:72{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:76{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:80{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:84{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:88{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:92{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} @@ -106,7 +106,7 @@ entry: ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} -define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 
addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -130,20 +130,20 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:1 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:2 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -166,20 +166,20 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -202,33 +202,33 @@ entry: ; 
HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16{{$}} ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} -define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/private-memory-atomics.ll b/test/CodeGen/AMDGPU/private-memory-atomics.ll index eea10c862238..9fa3051928a0 100644 --- a/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -4,7 +4,7 @@ ; This works because promote allocas pass replaces these with LDS atomics. ; Private atomics have no real use, but at least shouldn't crash on it. 
-define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -17,7 +17,7 @@ entry: ret void } -define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 diff --git a/test/CodeGen/AMDGPU/private-memory-broken.ll b/test/CodeGen/AMDGPU/private-memory-broken.ll index 8ba0b70dbdbb..9b5f655f1b52 100644 --- a/test/CodeGen/AMDGPU/private-memory-broken.ll +++ b/test/CodeGen/AMDGPU/private-memory-broken.ll @@ -7,7 +7,7 @@ declare i32 @foo(i32*) nounwind -define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 diff --git a/test/CodeGen/AMDGPU/private-memory-r600.ll b/test/CodeGen/AMDGPU/private-memory-r600.ll index 3e1796959aa6..866cd16ec3b5 100644 --- a/test/CodeGen/AMDGPU/private-memory-r600.ll +++ b/test/CodeGen/AMDGPU/private-memory-r600.ll @@ -12,11 +12,11 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0 ; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0 +; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1 +; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1 +; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -47,7 +47,7 @@ entry: ; R600-NOT: MOVA_INT %struct.point = type { i32, i32 } -define void @multiple_structs(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { entry: %a = alloca %struct.point %b = alloca %struct.point @@ -75,7 +75,7 @@ entry: ; FUNC-LABEL: {{^}}direct_loop: ; R600-NOT: MOVA_INT -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: %prv_array_const = alloca [2 x i32] %prv_array = alloca [2 x i32] @@ -110,7 +110,7 @@ for.end: ; FUNC-LABEL: {{^}}short_array: ; R600: MOVA_INT -define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i16] %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0 @@ -127,7 +127,7 @@ entry: ; FUNC-LABEL: {{^}}char_array: ; R600: MOVA_INT -define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8] %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0 @@ -148,7 +148,7 @@ entry: ; R600-NOT: MOV T0.X ; Additional check in case the move ends up in the last slot ; R600-NOT: MOV * TO.X -define void 
@work_item_info(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [2 x i32] %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0 @@ -169,7 +169,7 @@ entry: ; R600_CHECK: MOV ; R600_CHECK: [[CHAN:[XYZW]]]+ ; R600-NOT: [[CHAN]]+ -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1 %1 = alloca [2 x i8], align 1 @@ -193,7 +193,7 @@ entry: ret void } -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i8]] %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 @@ -207,7 +207,7 @@ entry: ret void } -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -220,7 +220,7 @@ entry: ret void } -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i64]] %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 @@ -235,7 +235,7 @@ entry: %struct.pair32 = type { i32, i32 } -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x %struct.pair32]] %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 @@ -248,7 +248,7 @@ entry: ret void } -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x %struct.pair32] %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 @@ -261,7 +261,7 @@ entry: ret void } -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -282,7 +282,7 @@ entry: ; SI-NOT: ds_write ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32] %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a store i32 5, i32* %tmp0 @@ -295,6 +295,7 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ret void } -; OPT: !0 = !{i32 0, i32 2048} +; OPT: !0 = !{i32 0, i32 257} +; OPT: !1 = !{i32 0, i32 256} attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" } diff --git a/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll index 3bd0aecf7aa9..41a68b18b0a7 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll 
+++ b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @array_alloca( ; CHECK: %stack = alloca i32, i32 5, align 4 -define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca i32, i32 5, align 4 %ld0 = load i32, i32 addrspace(1)* %in, align 4 @@ -27,7 +27,7 @@ entry: ; CHECK-LABEL: @array_alloca_dynamic( ; CHECK: %stack = alloca i32, i32 %size, align 4 -define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 { +define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 { entry: %stack = alloca i32, i32 %size, align 4 %ld0 = load i32, i32 addrspace(1)* %in, align 4 diff --git a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll index 82030f377d9f..a5eb92de9e4b 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll @@ -7,14 +7,14 @@ declare void @foo(float*) #0 declare void @foo.varargs(...) #0 ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo -define void @crash_call_constexpr_cast() #0 { +define amdgpu_kernel void @crash_call_constexpr_cast() #0 { %alloca = alloca i32 call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0 ret void } ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs -define void @crash_call_constexpr_cast_varargs() #0 { +define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 { %alloca = alloca i32 call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0 ret void diff --git a/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/test/CodeGen/AMDGPU/promote-alloca-globals.ll index eb0d0cc62697..38db51d4c8c6 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -5,12 +5,12 @@ @global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 -; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { ; IR: alloca [10 x i32] ; ASM-LABEL: {{^}}promote_alloca_size_256: ; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only) -define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [10 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll index 4c3c15dac0d1..f83eb56dc6ed 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll @@ -7,7 +7,7 @@ declare i8* @llvm.invariant.group.barrier(i8*) #1 ; GCN-LABEL: {{^}}use_invariant_promotable_lds: ; GCN: buffer_load_dword ; GCN: ds_write_b32 -define void 
@use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* diff --git a/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll index eeda19fa27ac..bd4571a9616b 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll @@ -1,21 +1,21 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca %s | FileCheck -check-prefix=OPT %s -declare void @llvm.lifetime.start(i64, i8* nocapture) #0 -declare void @llvm.lifetime.end(i64, i8* nocapture) #0 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 ; OPT-LABEL: @use_lifetime_promotable_lds( ; OPT-NOT: alloca i32 ; OPT-NOT: llvm.lifetime ; OPT: store i32 %tmp3, i32 addrspace(3)* -define void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* - call void @llvm.lifetime.start(i64 4, i8* %tmp1) + call void @llvm.lifetime.start.p0i8(i64 4, i8* %tmp1) %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 %tmp3 = load i32, i32 addrspace(1)* %tmp2 store i32 %tmp3, i32* %tmp - call void @llvm.lifetime.end(i64 4, i8* %tmp1) + call void @llvm.lifetime.end.p0i8(i64 4, i8* %tmp1) ret void } diff --git a/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll index 9cea1a23ea98..7a4a451ff360 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -8,13 +8,13 @@ declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocaptu declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0 -declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) #1 +declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) #1 ; CHECK-LABEL: @promote_with_memcpy( ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false) ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false) -define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -28,7 +28,7 @@ define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false) ; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false) -define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 
%alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -41,7 +41,7 @@ define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; CHECK-LABEL: @promote_with_memset( ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false) -define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -52,11 +52,11 @@ define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; CHECK-LABEL: @promote_with_objectsize( ; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}} -; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false) -define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { +; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false) +define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* - %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false) + %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false, i1 false) store i32 %size, i32 addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll index 8ba849e5f884..9f22f2071797 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll @@ -5,7 +5,7 @@ ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}} ; NOOPTS-NOT ds_write ; OPTS: ds_write -define void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -21,7 +21,7 @@ entry: ; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array: ; ALL: workgroup_group_segment_byte_size = 0{{$}} ; ALL-NOT ds_write -define void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 { +define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll index 468a789e4a67..bf3bc493a4b8 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll @@ -30,7 +30,7 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_0: ; GCN: workgroup_group_segment_byte_size = 2340 -define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, 
i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 @@ -62,7 +62,7 @@ entry: ; GCN-LABEL: {{^}}promote_alloca_size_order_1: ; GCN: workgroup_group_segment_byte_size = 2352 -define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 @@ -100,7 +100,7 @@ entry: ; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit: ; GCN: workgroup_group_segment_byte_size = 1060 -define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 diff --git a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll index 3bcbb4f986b7..03ce116cfcad 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}stored_lds_pointer_value: ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { %tmp = alloca float store float 0.0, float *%tmp store float* %tmp, float* addrspace(1)* %ptr @@ -14,7 +14,7 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { ; GCN-LABEL: {{^}}stored_lds_pointer_value_offset: ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float store float 0.0, float *%tmp0 @@ -29,7 +29,7 @@ define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 { bb: %tmp = alloca float, i32 16 store float 0.0, float* %tmp @@ -46,7 +46,7 @@ bb: ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) { entry: %tmp0 = alloca [4 x i32] %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0 @@ -64,7 +64,7 @@ entry: ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-NOT: ds_ -define void @stored_fi_to_self() #0 { +define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32* store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp %bitcast = bitcast i32** %tmp to i32* diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll index 2e7527dbdbc4..ebef61229905 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll @@ -8,7 +8,7 @@ ; CHECK: 
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1 -define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b @@ -22,7 +22,7 @@ define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %ou ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, null -define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %cmp = icmp eq i32* %ptr0, null @@ -35,7 +35,7 @@ define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %cmp = icmp eq i32 addrspace(3)* null, %ptr0 -define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %cmp = icmp eq i32* null, %ptr0 @@ -49,7 +49,7 @@ define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a ; CHECK: %ptr1 = call i32* @get_unknown_pointer() ; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1 -define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = call i32* @get_unknown_pointer() diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll index 0462a351c39b..d196897d67dc 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -13,7 +13,7 @@ ; CHECK: endif: ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 -define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %else @@ -34,7 +34,7 @@ endif: 
; CHECK-LABEL: @branch_ptr_phi_alloca_null_0( ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ] -define void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %endif @@ -51,7 +51,7 @@ endif: ; CHECK-LABEL: @branch_ptr_phi_alloca_null_1( ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ] -define void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %endif @@ -73,7 +73,7 @@ endif: ; CHECK: br label %exit ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ] ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 -define void @one_phi_value(i32 %a) #0 { +define amdgpu_kernel void @one_phi_value(i32 %a) #0 { entry: %alloca = alloca [64 x i32], align 4 %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a @@ -97,7 +97,7 @@ exit: ; CHECK: endif: ; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] ; CHECK: store i32 0, i32* %phi.ptr, align 4 -define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %else @@ -134,7 +134,7 @@ endif: ; CHECK-LABEL: @ptr_induction_var_same_alloca( ; CHECK: %alloca = alloca [64 x i32], align 4 ; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] -define void @ptr_induction_var_same_alloca() #0 { +define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 { entry: %alloca = alloca [64 x i32], align 4 %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 @@ -172,7 +172,7 @@ for.body: ; preds = %for.body, %entry ; CHECK: %alloca = alloca [64 x i32], align 4 ; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] ; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call -define void @ptr_induction_var_alloca_unknown() #0 { +define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 { entry: %alloca = alloca [64 x i32], align 4 %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll index 34d274df7387..55c2229fb6bd 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand( ; CHECK: %alloca = alloca i32 ; CHECK: select i1 undef, i32* undef, i32* %alloca -define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { %alloca = alloca i32, align 4 %select = select i1 undef, i32* undef, i32* %alloca store i32 0, i32* %select, align 4 @@ -16,7 +16,7 @@ define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 -define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { 
+define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b @@ -33,7 +33,7 @@ define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { ; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a ; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b ; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1 -define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { %alloca0 = alloca i32, i32 16, align 4 %alloca1 = alloca i32, i32 16, align 4 %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a @@ -50,7 +50,7 @@ define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 -define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { +define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1 %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3 @@ -67,7 +67,7 @@ define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { ; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2 ; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4 -define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b @@ -78,7 +78,7 @@ define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 ret void } -define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 { entry: %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a @@ -102,7 +102,7 @@ bb2: ; CHECK-LABEL: @select_null_rhs( ; CHECK-NOT: alloca ; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null -define void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { +define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 @@ -117,7 +117,7 @@ bb: ; CHECK-LABEL: @select_null_lhs( ; CHECK-NOT: alloca ; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}} -define void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { +define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 diff --git 
a/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll index e331731f90f6..88c0e911662d 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll @@ -8,7 +8,7 @@ declare void @llvm.stackrestore(i8*) #2 ; CHECK-LABEL: @try_promote_unhandled_intrinsic( ; CHECK: alloca ; CHECK: call void @llvm.stackrestore(i8* %tmp1) -define void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll index f9de38839bc5..9c43a6dc60f4 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll @@ -2,8 +2,8 @@ ; CHECK-LABEL: @volatile_load( ; CHECK: alloca [5 x i32] -; CHECK load volatile i32, i32* -define void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +; CHECK: load volatile i32, i32* +define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -15,8 +15,8 @@ entry: ; CHECK-LABEL: @volatile_store( ; CHECK: alloca [5 x i32] -; CHECK store volatile i32 %tmp, i32* -define void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +; CHECK: store volatile i32 %tmp, i32* +define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -30,7 +30,7 @@ entry: ; CHECK: alloca double ; CHECK: load double ; CHECK: load volatile double -define void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 { +define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 diff --git a/test/CodeGen/AMDGPU/pv.ll b/test/CodeGen/AMDGPU/pv.ll index d5f9833d6ad0..1474dbabba69 100644 --- a/test/CodeGen/AMDGPU/pv.ll +++ b/test/CodeGen/AMDGPU/pv.ll @@ -1,240 +1,236 @@ -; RUN: llc < %s -march=r600 | FileCheck %s +; RUN: llc -march=r600 < %s | FileCheck %s ; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) ; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X - define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) { main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = extractelement <4 x float> %reg2, i32 1 - %6 = extractelement <4 x float> %reg2, i32 2 - %7 = extractelement <4 x float> %reg2, i32 3 - %8 = extractelement <4 x float> %reg3, i32 0 - %9 = extractelement <4 x float> %reg3, i32 1 - %10 = extractelement <4 x float> %reg3, i32 2 - %11 = extractelement <4 x float> %reg3, i32 3 - %12 = extractelement <4 x float> %reg4, i32 0 - %13 = extractelement <4 x float> %reg4, i32 1 - %14 = extractelement <4 x float> %reg4, i32 2 - %15 = 
extractelement <4 x float> %reg4, i32 3 - %16 = extractelement <4 x float> %reg5, i32 0 - %17 = extractelement <4 x float> %reg5, i32 1 - %18 = extractelement <4 x float> %reg5, i32 2 - %19 = extractelement <4 x float> %reg5, i32 3 - %20 = extractelement <4 x float> %reg6, i32 0 - %21 = extractelement <4 x float> %reg6, i32 1 - %22 = extractelement <4 x float> %reg6, i32 2 - %23 = extractelement <4 x float> %reg6, i32 3 - %24 = extractelement <4 x float> %reg7, i32 0 - %25 = extractelement <4 x float> %reg7, i32 1 - %26 = extractelement <4 x float> %reg7, i32 2 - %27 = extractelement <4 x float> %reg7, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* null - %29 = extractelement <4 x float> %28, i32 0 - %30 = fmul float %0, %29 - %31 = load <4 x float>, <4 x float> addrspace(8)* null - %32 = extractelement <4 x float> %31, i32 1 - %33 = fmul float %0, %32 - %34 = load <4 x float>, <4 x float> addrspace(8)* null - %35 = extractelement <4 x float> %34, i32 2 - %36 = fmul float %0, %35 - %37 = load <4 x float>, <4 x float> addrspace(8)* null - %38 = extractelement <4 x float> %37, i32 3 - %39 = fmul float %0, %38 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %41 = extractelement <4 x float> %40, i32 0 - %42 = fmul float %1, %41 - %43 = fadd float %42, %30 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %45 = extractelement <4 x float> %44, i32 1 - %46 = fmul float %1, %45 - %47 = fadd float %46, %33 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %49 = extractelement <4 x float> %48, i32 2 - %50 = fmul float %1, %49 - %51 = fadd float %50, %36 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %53 = extractelement <4 x float> %52, i32 3 - %54 = fmul float %1, %53 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %57 = extractelement <4 x float> %56, i32 0 - %58 = fmul float %2, %57 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %61 = extractelement <4 x float> %60, i32 1 - %62 = fmul float %2, %61 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %65 = extractelement <4 x float> %64, i32 2 - %66 = fmul float %2, %65 - %67 = fadd float %66, %51 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %69 = extractelement <4 x float> %68, i32 3 - %70 = fmul float %2, %69 - %71 = fadd float %70, %55 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %73 = extractelement <4 x float> %72, i32 0 - %74 = fmul float %3, %73 - %75 = fadd float %74, %59 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %77 = extractelement <4 x float> %76, i32 1 - %78 = fmul float %3, 
%77 - %79 = fadd float %78, %63 - %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %81 = extractelement <4 x float> %80, i32 2 - %82 = fmul float %3, %81 - %83 = fadd float %82, %67 - %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %85 = extractelement <4 x float> %84, i32 3 - %86 = fmul float %3, %85 - %87 = fadd float %86, %71 - %88 = insertelement <4 x float> undef, float %4, i32 0 - %89 = insertelement <4 x float> %88, float %5, i32 1 - %90 = insertelement <4 x float> %89, float %6, i32 2 - %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3 - %92 = insertelement <4 x float> undef, float %4, i32 0 - %93 = insertelement <4 x float> %92, float %5, i32 1 - %94 = insertelement <4 x float> %93, float %6, i32 2 - %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 - %96 = call float @llvm.r600.dot4(<4 x float> %91, <4 x float> %95) - %97 = call float @llvm.fabs.f32(float %96) - %98 = call float @llvm.r600.recipsqrt.clamped.f32(float %97) - %99 = fmul float %4, %98 - %100 = fmul float %5, %98 - %101 = fmul float %6, %98 - %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %103 = extractelement <4 x float> %102, i32 0 - %104 = fmul float %103, %8 - %105 = fadd float %104, %20 - %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %107 = extractelement <4 x float> %106, i32 1 - %108 = fmul float %107, %9 - %109 = fadd float %108, %21 - %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %111 = extractelement <4 x float> %110, i32 2 - %112 = fmul float %111, %10 - %113 = fadd float %112, %22 - %114 = call float @llvm.AMDGPU.clamp.f32(float %105, float 0.000000e+00, float 1.000000e+00) - %115 = call float @llvm.AMDGPU.clamp.f32(float %109, float 0.000000e+00, float 1.000000e+00) - %116 = call float @llvm.AMDGPU.clamp.f32(float %113, float 0.000000e+00, float 1.000000e+00) - %117 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00) - %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %119 = extractelement <4 x float> %118, i32 0 - %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %121 = extractelement <4 x float> %120, i32 1 - %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %123 = extractelement <4 x float> %122, i32 2 - %124 = insertelement <4 x float> undef, float %99, i32 0 - %125 = insertelement <4 x float> %124, float %100, i32 1 - %126 = insertelement <4 x float> %125, float %101, i32 2 - %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3 - %128 = insertelement <4 x float> undef, float %119, i32 0 - %129 = insertelement <4 x float> %128, float %121, i32 1 - %130 = insertelement <4 x float> %129, float %123, i32 2 - %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3 - %132 = call float @llvm.r600.dot4(<4 x float> %127, <4 x float> %131) - %133 = load <4 x float>, <4 x 
float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %134 = extractelement <4 x float> %133, i32 0 - %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %136 = extractelement <4 x float> %135, i32 1 - %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %138 = extractelement <4 x float> %137, i32 2 - %139 = insertelement <4 x float> undef, float %99, i32 0 - %140 = insertelement <4 x float> %139, float %100, i32 1 - %141 = insertelement <4 x float> %140, float %101, i32 2 - %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3 - %143 = insertelement <4 x float> undef, float %134, i32 0 - %144 = insertelement <4 x float> %143, float %136, i32 1 - %145 = insertelement <4 x float> %144, float %138, i32 2 - %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3 - %147 = call float @llvm.r600.dot4(<4 x float> %142, <4 x float> %146) - %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %149 = extractelement <4 x float> %148, i32 0 - %150 = fmul float %149, %8 - %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %152 = extractelement <4 x float> %151, i32 1 - %153 = fmul float %152, %9 - %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %155 = extractelement <4 x float> %154, i32 2 - %156 = fmul float %155, %10 - %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %158 = extractelement <4 x float> %157, i32 0 - %159 = fmul float %158, %12 - %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %161 = extractelement <4 x float> %160, i32 1 - %162 = fmul float %161, %13 - %163 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %164 = extractelement <4 x float> %163, i32 2 - %165 = fmul float %164, %14 - %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %167 = extractelement <4 x float> %166, i32 0 - %168 = fmul float %167, %16 - %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %170 = extractelement <4 x float> %169, i32 1 - %171 = fmul float %170, %17 - %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %173 = extractelement <4 x float> %172, i32 2 - %174 = fmul float %173, %18 - %175 = fcmp uge float %132, 0.000000e+00 - %176 = select i1 %175, float %132, float 0.000000e+00 - %177 = fcmp uge float %147, 0.000000e+00 - %178 = select i1 %177, float %147, float 0.000000e+00 - %179 = call float @llvm.pow.f32(float %178, float %24) - %180 = fcmp ult float %132, 0.000000e+00 - %181 = select i1 %180, float 0.000000e+00, float %179 - %182 = fadd float %150, %105 - %183 = fadd float %153, %109 - %184 = fadd float 
%156, %113 - %185 = fmul float %176, %159 - %186 = fadd float %185, %182 - %187 = fmul float %176, %162 - %188 = fadd float %187, %183 - %189 = fmul float %176, %165 - %190 = fadd float %189, %184 - %191 = fmul float %181, %168 - %192 = fadd float %191, %186 - %193 = fmul float %181, %171 - %194 = fadd float %193, %188 - %195 = fmul float %181, %174 - %196 = fadd float %195, %190 - %197 = call float @llvm.AMDGPU.clamp.f32(float %192, float 0.000000e+00, float 1.000000e+00) - %198 = call float @llvm.AMDGPU.clamp.f32(float %194, float 0.000000e+00, float 1.000000e+00) - %199 = call float @llvm.AMDGPU.clamp.f32(float %196, float 0.000000e+00, float 1.000000e+00) - %200 = insertelement <4 x float> undef, float %75, i32 0 - %201 = insertelement <4 x float> %200, float %79, i32 1 - %202 = insertelement <4 x float> %201, float %83, i32 2 - %203 = insertelement <4 x float> %202, float %87, i32 3 - call void @llvm.r600.store.swizzle(<4 x float> %203, i32 60, i32 1) - %204 = insertelement <4 x float> undef, float %197, i32 0 - %205 = insertelement <4 x float> %204, float %198, i32 1 - %206 = insertelement <4 x float> %205, float %199, i32 2 - %207 = insertelement <4 x float> %206, float %117, i32 3 - call void @llvm.r600.store.swizzle(<4 x float> %207, i32 0, i32 2) + %tmp = extractelement <4 x float> %reg1, i32 0 + %tmp13 = extractelement <4 x float> %reg1, i32 1 + %tmp14 = extractelement <4 x float> %reg1, i32 2 + %tmp15 = extractelement <4 x float> %reg1, i32 3 + %tmp16 = extractelement <4 x float> %reg2, i32 0 + %tmp17 = extractelement <4 x float> %reg2, i32 1 + %tmp18 = extractelement <4 x float> %reg2, i32 2 + %tmp19 = extractelement <4 x float> %reg2, i32 3 + %tmp20 = extractelement <4 x float> %reg3, i32 0 + %tmp21 = extractelement <4 x float> %reg3, i32 1 + %tmp22 = extractelement <4 x float> %reg3, i32 2 + %tmp23 = extractelement <4 x float> %reg3, i32 3 + %tmp24 = extractelement <4 x float> %reg4, i32 0 + %tmp25 = extractelement <4 x float> %reg4, i32 1 + %tmp26 = extractelement <4 x float> %reg4, i32 2 + %tmp27 = extractelement <4 x float> %reg4, i32 3 + %tmp28 = extractelement <4 x float> %reg5, i32 0 + %tmp29 = extractelement <4 x float> %reg5, i32 1 + %tmp30 = extractelement <4 x float> %reg5, i32 2 + %tmp31 = extractelement <4 x float> %reg5, i32 3 + %tmp32 = extractelement <4 x float> %reg6, i32 0 + %tmp33 = extractelement <4 x float> %reg6, i32 1 + %tmp34 = extractelement <4 x float> %reg6, i32 2 + %tmp35 = extractelement <4 x float> %reg6, i32 3 + %tmp36 = extractelement <4 x float> %reg7, i32 0 + %tmp37 = extractelement <4 x float> %reg7, i32 1 + %tmp38 = extractelement <4 x float> %reg7, i32 2 + %tmp39 = extractelement <4 x float> %reg7, i32 3 + %tmp40 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp41 = extractelement <4 x float> %tmp40, i32 0 + %tmp42 = fmul float %tmp, %tmp41 + %tmp43 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp44 = extractelement <4 x float> %tmp43, i32 1 + %tmp45 = fmul float %tmp, %tmp44 + %tmp46 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp47 = extractelement <4 x float> %tmp46, i32 2 + %tmp48 = fmul float %tmp, %tmp47 + %tmp49 = load <4 x float>, <4 x float> addrspace(8)* null + %tmp50 = extractelement <4 x float> %tmp49, i32 3 + %tmp51 = fmul float %tmp, %tmp50 + %tmp52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp53 = extractelement <4 x float> %tmp52, i32 0 + %tmp54 = fmul float %tmp13, %tmp53 + %tmp55 = fadd float 
%tmp54, %tmp42 + %tmp56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp57 = extractelement <4 x float> %tmp56, i32 1 + %tmp58 = fmul float %tmp13, %tmp57 + %tmp59 = fadd float %tmp58, %tmp45 + %tmp60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp61 = extractelement <4 x float> %tmp60, i32 2 + %tmp62 = fmul float %tmp13, %tmp61 + %tmp63 = fadd float %tmp62, %tmp48 + %tmp64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %tmp65 = extractelement <4 x float> %tmp64, i32 3 + %tmp66 = fmul float %tmp13, %tmp65 + %tmp67 = fadd float %tmp66, %tmp51 + %tmp68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp69 = extractelement <4 x float> %tmp68, i32 0 + %tmp70 = fmul float %tmp14, %tmp69 + %tmp71 = fadd float %tmp70, %tmp55 + %tmp72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp73 = extractelement <4 x float> %tmp72, i32 1 + %tmp74 = fmul float %tmp14, %tmp73 + %tmp75 = fadd float %tmp74, %tmp59 + %tmp76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp77 = extractelement <4 x float> %tmp76, i32 2 + %tmp78 = fmul float %tmp14, %tmp77 + %tmp79 = fadd float %tmp78, %tmp63 + %tmp80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %tmp81 = extractelement <4 x float> %tmp80, i32 3 + %tmp82 = fmul float %tmp14, %tmp81 + %tmp83 = fadd float %tmp82, %tmp67 + %tmp84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %tmp85 = extractelement <4 x float> %tmp84, i32 0 + %tmp86 = fmul float %tmp15, %tmp85 + %tmp87 = fadd float %tmp86, %tmp71 + %tmp88 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %tmp89 = extractelement <4 x float> %tmp88, i32 1 + %tmp90 = fmul float %tmp15, %tmp89 + %tmp91 = fadd float %tmp90, %tmp75 + %tmp92 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %tmp93 = extractelement <4 x float> %tmp92, i32 2 + %tmp94 = fmul float %tmp15, %tmp93 + %tmp95 = fadd float %tmp94, %tmp79 + %tmp96 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %tmp97 = extractelement <4 x float> %tmp96, i32 3 + %tmp98 = fmul float %tmp15, %tmp97 + %tmp99 = fadd float %tmp98, %tmp83 + %tmp100 = insertelement <4 x float> undef, float %tmp16, i32 0 + %tmp101 = insertelement <4 x float> %tmp100, float %tmp17, i32 1 + %tmp102 = insertelement <4 x float> %tmp101, float %tmp18, i32 2 + %tmp103 = insertelement <4 x float> %tmp102, float 0.000000e+00, i32 3 + %tmp104 = insertelement <4 x float> undef, float %tmp16, i32 0 + %tmp105 = insertelement <4 x float> %tmp104, float %tmp17, i32 1 + %tmp106 = insertelement <4 x float> %tmp105, float %tmp18, i32 2 + %tmp107 = insertelement <4 x float> %tmp106, float 
0.000000e+00, i32 3 + %tmp108 = call float @llvm.r600.dot4(<4 x float> %tmp103, <4 x float> %tmp107) + %tmp109 = call float @llvm.fabs.f32(float %tmp108) + %tmp110 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp109) + %tmp111 = fmul float %tmp16, %tmp110 + %tmp112 = fmul float %tmp17, %tmp110 + %tmp113 = fmul float %tmp18, %tmp110 + %tmp114 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %tmp115 = extractelement <4 x float> %tmp114, i32 0 + %tmp116 = fmul float %tmp115, %tmp20 + %tmp117 = fadd float %tmp116, %tmp32 + %tmp118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %tmp119 = extractelement <4 x float> %tmp118, i32 1 + %tmp120 = fmul float %tmp119, %tmp21 + %tmp121 = fadd float %tmp120, %tmp33 + %tmp122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %tmp123 = extractelement <4 x float> %tmp122, i32 2 + %tmp124 = fmul float %tmp123, %tmp22 + %tmp125 = fadd float %tmp124, %tmp34 + %max.0.i = call float @llvm.maxnum.f32(float %tmp117, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) + %max.0.i11 = call float @llvm.maxnum.f32(float %tmp121, float 0.000000e+00) + %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00) + %max.0.i9 = call float @llvm.maxnum.f32(float %tmp125, float 0.000000e+00) + %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00) + %max.0.i7 = call float @llvm.maxnum.f32(float %tmp27, float 0.000000e+00) + %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00) + %tmp126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %tmp127 = extractelement <4 x float> %tmp126, i32 0 + %tmp128 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %tmp129 = extractelement <4 x float> %tmp128, i32 1 + %tmp130 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %tmp131 = extractelement <4 x float> %tmp130, i32 2 + %tmp132 = insertelement <4 x float> undef, float %tmp111, i32 0 + %tmp133 = insertelement <4 x float> %tmp132, float %tmp112, i32 1 + %tmp134 = insertelement <4 x float> %tmp133, float %tmp113, i32 2 + %tmp135 = insertelement <4 x float> %tmp134, float 0.000000e+00, i32 3 + %tmp136 = insertelement <4 x float> undef, float %tmp127, i32 0 + %tmp137 = insertelement <4 x float> %tmp136, float %tmp129, i32 1 + %tmp138 = insertelement <4 x float> %tmp137, float %tmp131, i32 2 + %tmp139 = insertelement <4 x float> %tmp138, float 0.000000e+00, i32 3 + %tmp140 = call float @llvm.r600.dot4(<4 x float> %tmp135, <4 x float> %tmp139) + %tmp141 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %tmp142 = extractelement <4 x float> %tmp141, i32 0 + %tmp143 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %tmp144 = extractelement <4 x float> %tmp143, i32 1 + %tmp145 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x 
float>] addrspace(8)* null, i64 0, i32 7) + %tmp146 = extractelement <4 x float> %tmp145, i32 2 + %tmp147 = insertelement <4 x float> undef, float %tmp111, i32 0 + %tmp148 = insertelement <4 x float> %tmp147, float %tmp112, i32 1 + %tmp149 = insertelement <4 x float> %tmp148, float %tmp113, i32 2 + %tmp150 = insertelement <4 x float> %tmp149, float 0.000000e+00, i32 3 + %tmp151 = insertelement <4 x float> undef, float %tmp142, i32 0 + %tmp152 = insertelement <4 x float> %tmp151, float %tmp144, i32 1 + %tmp153 = insertelement <4 x float> %tmp152, float %tmp146, i32 2 + %tmp154 = insertelement <4 x float> %tmp153, float 0.000000e+00, i32 3 + %tmp155 = call float @llvm.r600.dot4(<4 x float> %tmp150, <4 x float> %tmp154) + %tmp156 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %tmp157 = extractelement <4 x float> %tmp156, i32 0 + %tmp158 = fmul float %tmp157, %tmp20 + %tmp159 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %tmp160 = extractelement <4 x float> %tmp159, i32 1 + %tmp161 = fmul float %tmp160, %tmp21 + %tmp162 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %tmp163 = extractelement <4 x float> %tmp162, i32 2 + %tmp164 = fmul float %tmp163, %tmp22 + %tmp165 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %tmp166 = extractelement <4 x float> %tmp165, i32 0 + %tmp167 = fmul float %tmp166, %tmp24 + %tmp168 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %tmp169 = extractelement <4 x float> %tmp168, i32 1 + %tmp170 = fmul float %tmp169, %tmp25 + %tmp171 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %tmp172 = extractelement <4 x float> %tmp171, i32 2 + %tmp173 = fmul float %tmp172, %tmp26 + %tmp174 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %tmp175 = extractelement <4 x float> %tmp174, i32 0 + %tmp176 = fmul float %tmp175, %tmp28 + %tmp177 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %tmp178 = extractelement <4 x float> %tmp177, i32 1 + %tmp179 = fmul float %tmp178, %tmp29 + %tmp180 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %tmp181 = extractelement <4 x float> %tmp180, i32 2 + %tmp182 = fmul float %tmp181, %tmp30 + %tmp183 = fcmp uge float %tmp140, 0.000000e+00 + %tmp184 = select i1 %tmp183, float %tmp140, float 0.000000e+00 + %tmp185 = fcmp uge float %tmp155, 0.000000e+00 + %tmp186 = select i1 %tmp185, float %tmp155, float 0.000000e+00 + %tmp187 = call float @llvm.pow.f32(float %tmp186, float %tmp36) + %tmp188 = fcmp ult float %tmp140, 0.000000e+00 + %tmp189 = select i1 %tmp188, float 0.000000e+00, float %tmp187 + %tmp190 = fadd float %tmp158, %tmp117 + %tmp191 = fadd float %tmp161, %tmp121 + %tmp192 = fadd float %tmp164, %tmp125 + %tmp193 = fmul float %tmp184, %tmp167 + %tmp194 = fadd float %tmp193, %tmp190 + %tmp195 = fmul float %tmp184, %tmp170 + 
%tmp196 = fadd float %tmp195, %tmp191 + %tmp197 = fmul float %tmp184, %tmp173 + %tmp198 = fadd float %tmp197, %tmp192 + %tmp199 = fmul float %tmp189, %tmp176 + %tmp200 = fadd float %tmp199, %tmp194 + %tmp201 = fmul float %tmp189, %tmp179 + %tmp202 = fadd float %tmp201, %tmp196 + %tmp203 = fmul float %tmp189, %tmp182 + %tmp204 = fadd float %tmp203, %tmp198 + %max.0.i5 = call float @llvm.maxnum.f32(float %tmp200, float 0.000000e+00) + %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00) + %max.0.i3 = call float @llvm.maxnum.f32(float %tmp202, float 0.000000e+00) + %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00) + %max.0.i1 = call float @llvm.maxnum.f32(float %tmp204, float 0.000000e+00) + %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) + %tmp205 = insertelement <4 x float> undef, float %tmp87, i32 0 + %tmp206 = insertelement <4 x float> %tmp205, float %tmp91, i32 1 + %tmp207 = insertelement <4 x float> %tmp206, float %tmp95, i32 2 + %tmp208 = insertelement <4 x float> %tmp207, float %tmp99, i32 3 + call void @llvm.r600.store.swizzle(<4 x float> %tmp208, i32 60, i32 1) + %tmp209 = insertelement <4 x float> undef, float %clamp.i6, i32 0 + %tmp210 = insertelement <4 x float> %tmp209, float %clamp.i4, i32 1 + %tmp211 = insertelement <4 x float> %tmp210, float %clamp.i2, i32 2 + %tmp212 = insertelement <4 x float> %tmp211, float %clamp.i8, i32 3 + call void @llvm.r600.store.swizzle(<4 x float> %tmp212, i32 0, i32 2) ret void } -; Function Attrs: readnone -declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readonly -declare float @llvm.fabs.f32(float) #1 - -; Function Attrs: readnone -declare float @llvm.r600.recipsqrt.clamped.f32(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #2 - -declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #3 +declare float @llvm.minnum.f32(float, float) #0 +declare float @llvm.maxnum.f32(float, float) #0 +declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.r600.recipsqrt.clamped.f32(float) #0 +declare float @llvm.pow.f32(float, float) #0 +declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #1 -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind readonly } -attributes #3 = { nounwind } +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll index 461caf5b5d20..e2143ff85b72 100644 --- a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll +++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll @@ -10,7 +10,7 @@ main_body: %tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 1 %tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 2 %tmp8 = insertelement <4 x float> %tmp7, float %tmp4, i32 3 - %tmp9 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp8) + %tmp9 = call <4 x float> @llvm.r600.cube(<4 x float> %tmp8) %tmp10 = extractelement <4 x float> %tmp9, i32 0 %tmp11 = extractelement <4 x float> %tmp9, i32 1 %tmp12 = extractelement <4 x float> %tmp9, i32 2 @@ -45,7 +45,7 @@ main_body: } ; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0 +declare <4 x float> @llvm.r600.cube(<4 x 
float>) #0 ; Function Attrs: readnone declare float @fabs(float) #0 diff --git a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll index 866a4a9191e2..b7ed34bbf09b 100644 --- a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll +++ b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll @@ -2,7 +2,7 @@ ; Don't crash ; CHECK: MAX_UINT -define void @test(i64 addrspace(1)* %out) { +define amdgpu_kernel void @test(i64 addrspace(1)* %out) { bb: store i64 2, i64 addrspace(1)* %out %tmp = load i64, i64 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/r600.alu-limits.ll b/test/CodeGen/AMDGPU/r600.alu-limits.ll new file mode 100644 index 000000000000..2604ed4e574c --- /dev/null +++ b/test/CodeGen/AMDGPU/r600.alu-limits.ll @@ -0,0 +1,29 @@ +; RUN: opt -loop-unroll -unroll-threshold=2000 -S < %s | llc -march=r600 -mcpu=cypress | FileCheck %s +; REQUIRES: asserts + +; CHECK: {{^}}alu_limits: +; CHECK: CF_END + +%struct.foo = type {i32, i32, i32} + +define amdgpu_kernel void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) { +entry: + %ptr = getelementptr inbounds %struct.foo, %struct.foo* %in, i32 1, i32 2 + %x = load i32, i32 *%ptr, align 4 + br label %loop +loop: + %i = phi i32 [ 100, %entry ], [ %nexti, %loop ] + %val = phi i32 [ 1, %entry ], [ %nextval, %loop ] + + %nexti = sub i32 %i, 1 + + %y = xor i32 %x, %i + %nextval = mul i32 %val, %y + + %cond = icmp ne i32 %nexti, 0 + br i1 %cond, label %loop, label %end +end: + %out_val = add i32 %nextval, 4 + store i32 %out_val, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll b/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll new file mode 100644 index 000000000000..8956d113e8b5 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll @@ -0,0 +1,7 @@ +; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s + +; CHECK: NoAlias: i8 addrspace(7)* %p1, i8* %p + +define amdgpu_kernel void @test(i8* %p, i8 addrspace(7)* %p1) { + ret void +} diff --git a/test/CodeGen/AMDGPU/r600.bitcast.ll b/test/CodeGen/AMDGPU/r600.bitcast.ll index 49441ee8d186..acf7a66a357f 100644 --- a/test/CodeGen/AMDGPU/r600.bitcast.ll +++ b/test/CodeGen/AMDGPU/r600.bitcast.ll @@ -8,7 +8,7 @@ ; EG: VTX_READ_128 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 @@ -21,7 +21,7 @@ entry: ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %load to <2 x i16> store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 @@ -33,7 +33,7 @@ define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal 
-define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %bc = bitcast <2 x i16> %load to float store float %bc, float addrspace(1)* %out, align 4 @@ -45,7 +45,7 @@ define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %bc = bitcast <4 x i8> %load to i32 store i32 %bc, i32 addrspace(1)* %out, align 4 @@ -57,7 +57,7 @@ define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nou ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -69,7 +69,7 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %bc = bitcast <2 x i16> %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -85,7 +85,7 @@ define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* ; EG: VTX_READ_16 ; EG-DAG: BFE_UINT ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 2 %bc = bitcast <4 x i16> %load to <8 x i8> %element = extractelement <8 x i8> %bc, i32 5 @@ -98,7 +98,7 @@ define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in ; EG: VTX_READ_64 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %bc = bitcast <2 x i32> %val to double store double %bc, double addrspace(1)* %out, align 8 diff --git a/test/CodeGen/AMDGPU/r600.global_atomics.ll b/test/CodeGen/AMDGPU/r600.global_atomics.ll new file mode 100644 index 000000000000..1ddc41feb006 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600.global_atomics.ll @@ -0,0 +1,542 @@ +; RUN: llc 
-march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; TODO: Add _RTN versions and merge with the GCN test + +; FUNC-LABEL: {{^}}atomic_add_i32_offset: +; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_soffset: +; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_huge_offset: +; FIXME: looks like the offset is wrong +; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 + + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32: +; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_addr64: +; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_offset: +; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32: +; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* 
%out, i32 %in) { +entry: + %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_addr64: +; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_offset: +; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32: +; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: +; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_offset: +; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32: +; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_addr64: +; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_offset: +; EG: MEM_RAT 
ATOMIC_MAX_UINT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32: +; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: +; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_offset: +; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32: +; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_addr64: +; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_offset: +; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 
%in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32: +; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: +; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_offset: +; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32: +; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_addr64: +; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: +; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32: +; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: 
{{^}}atomic_xchg_i32_addr64: +; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset: +; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_cmpxchg_i32: +; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64: +; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_offset: +; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32: +; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: +; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_store_i32_offset: +; EG: 
MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4 + ret void +} + +; FUNC-LABEL: {{^}}atomic_store_i32: +; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 + ret void +} + +; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset: +; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 + store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4 + ret void +} + +; FUNC-LABEL: {{^}}atomic_store_i32_addr64: +; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] +; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4 + ret void +} + +; FUNC-LABEL: {{^}}atomic_inc_add +; EG: MEM_RAT ATOMIC_INC_UINT +define amdgpu_kernel void @atomic_inc_add(i32 addrspace(1)* %out) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_dec_add +; EG: MEM_RAT ATOMIC_DEC_UINT +define amdgpu_kernel void @atomic_dec_add(i32 addrspace(1)* %out) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 -1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_inc_sub +; EG: MEM_RAT ATOMIC_INC_UINT +define amdgpu_kernel void @atomic_inc_sub(i32 addrspace(1)* %out) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 -1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_dec_sub +; EG: MEM_RAT ATOMIC_DEC_UINT +define amdgpu_kernel void @atomic_dec_sub(i32 addrspace(1)* %out) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 + %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst + ret void +} diff --git a/test/CodeGen/AMDGPU/r600.private-memory.ll b/test/CodeGen/AMDGPU/r600.private-memory.ll index f406c160cbbe..53ee214f07ec 100644 --- a/test/CodeGen/AMDGPU/r600.private-memory.ll +++ b/test/CodeGen/AMDGPU/r600.private-memory.ll @@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; Additional check in case the move ends up in the last slot ; R600-NOT: MOV * TO.X -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: %0 = alloca [2 x i32] %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 diff --git a/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll index a34a48e3b7ba..9eee9a6effc9 100644 --- a/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll @@ -2,7 +2,7 @@ ; FUNC-LABEL: {{^}}tgid_x: ; EG: MEM_RAT_CACHELESS STORE_RAW 
T1.X -define void @tgid_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -11,7 +11,7 @@ entry: ; FUNC-LABEL: {{^}}tgid_y: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Y -define void @tgid_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ entry: ; FUNC-LABEL: {{^}}tgid_z: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Z -define void @tgid_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_x: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X -define void @tidig_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_y: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Y -define void @tidig_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -47,7 +47,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_z: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Z -define void @tidig_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -57,7 +57,7 @@ entry: ; FUNC-LABEL: {{^}}test_implicit: ; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56 -define void @test_implicit(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)* %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4 @@ -69,7 +69,7 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}test_implicit_dyn: ; 36 prepended implicit bytes + 8(out pointer + in) = 44 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44 -define void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 { +define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 { %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)* %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll index b7cc6d47cd87..fbdaeb829297 100644 --- a/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -9,7 +9,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_ulp25_pat_f32(float 
addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -33,7 +33,7 @@ define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv fast float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -45,7 +45,7 @@ define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv arcp float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -57,7 +57,7 @@ define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { +define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -69,7 +69,7 @@ define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %rcp = fdiv float 1.0, %src.fabs store float %rcp, float addrspace(1)* %out, align 4 @@ -82,7 +82,7 @@ define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float -1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -92,7 +92,7 @@ define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]| ; GCN: buffer_store_dword [[RCP]] -define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg @@ -106,7 +106,7 @@ define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]| ; GCN: buffer_store_dword [[RCP]] ; GCN: buffer_store_dword [[MUL]] -define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg @@ -117,6 +117,35 @@ define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %sr ret void } +; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32: +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} +; GCN: 
buffer_store_dword [[MUL]] +define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 { + %x = load float, float addrspace(1)* undef + %rcp = fdiv arcp float %x, 2.0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32: +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0x3dcccccd, v{{[0-9]+}} +; GCN: buffer_store_dword [[MUL]] +define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 { + %x = load float, float addrspace(1)* undef + %rcp = fdiv arcp float %x, 10.0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32: +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbdcccccd, v{{[0-9]+}} +; GCN: buffer_store_dword [[MUL]] +define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 { + %x = load float, float addrspace(1)* undef + %rcp = fdiv arcp float %x, -10.0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} declare float @llvm.fabs.f32(float) #1 declare float @llvm.sqrt.f32(float) #1 diff --git a/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll index a5581d73cb25..34cbe3963361 100644 --- a/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll +++ b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll @@ -4,7 +4,7 @@ declare i32 @llvm.read_register.i32(metadata) #0 -define void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind { store volatile i32 0, i32 addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll index 2617ad7402ff..6417d28e7aad 100644 --- a/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll +++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll @@ -4,7 +4,7 @@ declare i32 @llvm.read_register.i32(metadata) #0 -define void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind { store volatile i32 0, i32 addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll index dcde8a1894fc..8e248fdfea4c 100644 --- a/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll +++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll @@ -4,7 +4,7 @@ declare i64 @llvm.read_register.i64(metadata) #0 -define void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 { %exec = call i64 @llvm.read_register.i64(metadata !0) store i64 %exec, i64 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/read_register.ll b/test/CodeGen/AMDGPU/read_register.ll index 601a0adb8122..8fe9e7f3f111 100644 --- a/test/CodeGen/AMDGPU/read_register.ll +++ b/test/CodeGen/AMDGPU/read_register.ll @@ -9,7 +9,7 @@ declare i64 @llvm.read_register.i64(metadata) #0 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]] ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_m0(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 { store volatile i32 0, i32 
addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ define void @test_read_m0(i32 addrspace(1)* %out) #0 { ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_read_exec(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 { %exec = call i64 @llvm.read_register.i64(metadata !1) store i64 %exec, i64 addrspace(1)* %out ret void @@ -30,7 +30,7 @@ define void @test_read_exec(i64 addrspace(1)* %out) #0 { ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { %flat_scratch = call i64 @llvm.read_register.i64(metadata !2) store i64 %flat_scratch, i64 addrspace(1)* %out ret void @@ -39,7 +39,7 @@ define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_flat_scratch_lo: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 { %flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3) store i32 %flat_scratch_lo, i32 addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_flat_scratch_hi: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 { %flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4) store i32 %flat_scratch_hi, i32 addrspace(1)* %out ret void @@ -57,7 +57,7 @@ define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_exec_lo: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec_lo(i32 addrspace(1)* %out) #0 { %exec_lo = call i32 @llvm.read_register.i32(metadata !5) store i32 %exec_lo, i32 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_exec_hi: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_exec_hi(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec_hi(i32 addrspace(1)* %out) #0 { %exec_hi = call i32 @llvm.read_register.i32(metadata !6) store i32 %exec_hi, i32 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll index 7965b061fe5b..5c698c839fa6 100644 --- a/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -13,7 +13,7 @@ declare i64 @llvm.readcyclecounter() #0 ; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}} ; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: store_dwordx2 -define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 
@llvm.readcyclecounter() store volatile i64 %cycle0, i64 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll index dd67dc488dbf..ecb513cd80b6 100644 --- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll +++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll @@ -6,7 +6,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]] ; GCN: buffer_store_dwordx2 -define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -16,7 +16,7 @@ define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 ad ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0: ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: buffer_store_dword [[VAL]] -define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %vec = bitcast i64 %a to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 0 @@ -27,7 +27,7 @@ define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 a ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; GCN: buffer_store_dword [[VAL]] -define void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %vec = bitcast i64 %a to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 1 diff --git a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll index 281e49f804c6..601aca48e1e2 100644 --- a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll +++ b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4: ; GCN: s_load_dwordx2 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { +define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { %x.bc = bitcast <2 x i32> %x to <4 x i16> store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, ; GCN: s_load_dwordx4 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { %x.bc = bitcast <4 x i32> %x to <8 x i16> store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4 ret void @@ -22,7 +22,7 @@ define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, ; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4: ; GCN: 
s_load_dwordx2 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { +define amdgpu_kernel void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { %x.bc = bitcast <2 x i32> %x to <4 x i16> store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4 ret void @@ -32,7 +32,7 @@ define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 ; GCN: s_load_dwordx4 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { %x.bc = bitcast <4 x i32> %x to <2 x i64> store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4 ret void @@ -44,7 +44,7 @@ define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 { +define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 { %x.bc = bitcast <4 x i16> %x to <2 x i32> store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll index 909644850750..9f8667d35993 100644 --- a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll +++ b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 -define void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 { +define amdgpu_kernel void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 { bb: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %cmp0 = icmp eq i32 %id.x, 0 diff --git a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir new file mode 100644 index 000000000000..ecf94b5772ff --- /dev/null +++ b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir @@ -0,0 +1,76 @@ +# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s + +# Test that register coalescing does not allow a call to +# LIS->getInstructionIndex with a DBG_VALUE instruction, which does not have +# a slot index. 
+ +# CHECK: %13.sub2 = S_MOV_B32 0 +# CHECK: DBG_VALUE{{.*}}debug-use %13.sub2 + +--- | + define amdgpu_kernel void @test(i32 addrspace(1)* %out) { ret void } + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) + !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) + !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !5) + !3 = !DISubroutineType(types: !4) + !4 = !{null} + !5 = !{!1} + !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32) + !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !8 = !DIExpression() + !9 = !DILocation(line: 126, column: 9, scope: !2) + +... +--- +name: test +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vreg_64 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vreg_64 } + - { id: 20, class: vreg_64 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +body: | + bb.0: + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY killed %vgpr0 + %0 = COPY killed %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORD_IMM killed %0, 13, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`) + %18 = V_ASHRREV_I32_e32 31, %3, implicit %exec + undef %19.sub0 = COPY killed %3 + %19.sub1 = COPY killed %18 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + DBG_VALUE debug-use %11, debug-use _, !1, !8, debug-location !9 + undef %12.sub0 = COPY killed %11 + %12.sub1 = COPY killed %10 + undef %13.sub0_sub1 = COPY killed %4 + %13.sub2_sub3 = COPY killed %12 + %20 = V_LSHL_B64 killed %19, 2, implicit %exec + %16 = COPY killed %5 + BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out) + S_ENDPGM + +... 
diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll index bff3a9f5d2b0..26a76cf2041e 100644 --- a/test/CodeGen/AMDGPU/register-count-comments.ll +++ b/test/CodeGen/AMDGPU/register-count-comments.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 ; SI: ; Kernel info: ; SI: ; NumSgprs: {{[0-9]+}} ; SI: ; NumVgprs: {{[0-9]+}} -define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { +define amdgpu_kernel void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid @@ -24,7 +24,7 @@ define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 a ; SI-LABEL: {{^}}one_vgpr_used: ; SI: NumVgprs: 1 -define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { +define amdgpu_kernel void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { store i32 %x, i32 addrspace(1)* %out, align 4 ret void } diff --git a/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll index 47bdfba96530..5d4955aa1ce2 100644 --- a/test/CodeGen/AMDGPU/rename-disconnected-bug.ll +++ b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll @@ -3,7 +3,7 @@ ; definition on every path (there should at least be IMPLICIT_DEF instructions). target triple = "amdgcn--" -define void @func() { +define amdgpu_kernel void @func() { B0: br i1 undef, label %B1, label %B2 diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/test/CodeGen/AMDGPU/rename-independent-subregs.mir index b928bc7086bb..fc2e4426ba48 100644 --- a/test/CodeGen/AMDGPU/rename-independent-subregs.mir +++ b/test/CodeGen/AMDGPU/rename-independent-subregs.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass simple-register-coalescing,rename-independent-subregs -o - %s | FileCheck %s --- | - define void @test0() { ret void } - define void @test1() { ret void } + define amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } ... 
--- # In the test below we have two independent def+use pairs of subregister1 which diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll index 412202fa5d51..ff4069226a62 100644 --- a/test/CodeGen/AMDGPU/reorder-stores.ll +++ b/test/CodeGen/AMDGPU/reorder-stores.ll @@ -7,7 +7,7 @@ ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 @@ -19,7 +19,7 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap ; SI: ds_read2_b64 ; SI: ds_write2_b64 ; SI: s_endpgm -define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 @@ -39,7 +39,7 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 @@ -54,7 +54,7 @@ define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* no ; SI-NOT: ds_read ; SI: ds_write_b64 ; SI: s_endpgm -define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll index 515203fad4cb..831c71dff79d 100644 --- a/test/CodeGen/AMDGPU/ret.ll +++ b/test/CodeGen/AMDGPU/ret.ll @@ -1,25 +1,24 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - ; GCN-LABEL: {{^}}vgpr: ; GCN: v_mov_b32_e32 v1, v0 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1 -; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm +; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* 
byval, i32 inreg, i32 inreg, float) { - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) - %x = fadd float %3, 1.0 - %a = insertvalue {float, float} undef, float %x, 0 - %b = insertvalue {float, float} %a, float %3, 1 - ret {float, float} %b +define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 + %x = fadd float %arg3, 1.000000e+00 + %a = insertvalue { float, float } undef, float %x, 0 + %b = insertvalue { float, float } %a, float %arg3, 1 + ret { float, float } %b } ; GCN-LABEL: {{^}}vgpr_literal: ; GCN: v_mov_b32_e32 v4, v0 -; GCN: exp mrt0 v4, v4, v4, v4 done compr vm +; GCN: exp mrt0 v4, v4, v4, v4 done vm ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 @@ -27,12 +26,12 @@ define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 i ; GCN-DAG: v_mov_b32_e32 v3, -1.0 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) - ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0} +define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 + ret { float, float, float, float } { float 1.000000e+00, float 2.000000e+00, float 4.000000e+00, float -1.000000e+00 } } - ; GCN: .long 165580 ; GCN-NEXT: .long 562 ; GCN-NEXT: .long 165584 @@ -44,24 +43,24 @@ define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addr ; GCN: v_mov_b32_e32 v3, v4 ; GCN: v_mov_b32_e32 v4, v6 ; GCN-NOT: s_endpgm -define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - %i0 = extractelement <2 x i32> %4, i32 0 - %i1 = extractelement <2 x i32> %4, i32 1 - %i2 = extractelement <2 x i32> %7, i32 0 - %i3 = extractelement <2 x i32> %8, i32 0 +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +bb: + %i0 = extractelement <2 x i32> %arg4, i32 0 + %i1 = extractelement <2 x i32> %arg4, i32 1 + %i2 = extractelement <2 x i32> %arg7, i32 0 + %i3 = extractelement <2 x i32> %arg8, i32 0 %f0 = bitcast i32 %i0 to float %f1 = bitcast i32 %i1 to float %f2 = bitcast i32 %i2 to float %f3 = bitcast i32 %i3 to float - %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0 - %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1 - %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2 - %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3 - %r4 = 
insertvalue {float, float, float, float, float} %r3, float %12, 4 - ret {float, float, float, float, float} %r4 + %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0 + %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1 + %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2 + %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3 + %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4 + ret { float, float, float, float, float } %r4 } - ; GCN: .long 165580 ; GCN-NEXT: .long 1 ; GCN-NEXT: .long 165584 @@ -69,11 +68,11 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i ; GCN-LABEL: {{^}}ps_input_ena_no_inputs: ; GCN: v_mov_b32_e32 v0, 1.0 ; GCN-NOT: s_endpgm -define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - ret float 1.0 +define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +bb: + ret float 1.000000e+00 } - ; GCN: .long 165580 ; GCN-NEXT: .long 2081 ; GCN-NEXT: .long 165584 @@ -83,14 +82,14 @@ define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byv ; GCN-DAG: v_mov_b32_e32 v1, v2 ; GCN: v_mov_b32_e32 v2, v3 ; GCN-NOT: s_endpgm -define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - %f = bitcast <2 x i32> %8 to <2 x float> - %s = insertvalue {float, <2 x float>} undef, float %14, 0 - %s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1 - ret {float, <2 x float>} %s1 +define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +bb: + %f = bitcast <2 x i32> %arg8 to <2 x float> + %s = insertvalue { float, <2 x float> } undef, float %arg14, 0 + %s1 = insertvalue { float, <2 x float> } %s, <2 x float> %f, 1 + ret { float, <2 x float> } %s1 } - ; GCN: .long 165580 ; GCN-NEXT: .long 562 ; GCN-NEXT: .long 165584 @@ -102,25 +101,24 @@ define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrsp ; GCN-DAG: v_mov_b32_e32 v3, v6 ; GCN-DAG: v_mov_b32_e32 v4, v8 ; GCN-NOT: s_endpgm -attributes #1 = { "InitialPSInputAddr"="1" } -define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 { - %i0 = extractelement <2 x i32> %4, i32 0 - %i1 = extractelement <2 x i32> %4, i32 1 - %i2 = extractelement <2 x i32> %7, i32 0 - %i3 = extractelement <2 x i32> %8, i32 0 +define 
amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 { +bb: + %i0 = extractelement <2 x i32> %arg4, i32 0 + %i1 = extractelement <2 x i32> %arg4, i32 1 + %i2 = extractelement <2 x i32> %arg7, i32 0 + %i3 = extractelement <2 x i32> %arg8, i32 0 %f0 = bitcast i32 %i0 to float %f1 = bitcast i32 %i1 to float %f2 = bitcast i32 %i2 to float %f3 = bitcast i32 %i3 to float - %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0 - %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1 - %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2 - %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3 - %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4 - ret {float, float, float, float, float} %r4 + %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0 + %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1 + %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2 + %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3 + %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4 + ret { float, float, float, float, float } %r4 } - ; GCN: .long 165580 ; GCN-NEXT: .long 562 ; GCN-NEXT: .long 165584 @@ -132,25 +130,24 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i ; GCN: v_mov_b32_e32 v3, v8 ; GCN: v_mov_b32_e32 v4, v12 ; GCN-NOT: s_endpgm -attributes #2 = { "InitialPSInputAddr"="119" } -define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 { - %i0 = extractelement <2 x i32> %4, i32 0 - %i1 = extractelement <2 x i32> %4, i32 1 - %i2 = extractelement <2 x i32> %7, i32 0 - %i3 = extractelement <2 x i32> %8, i32 0 +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 { +bb: + %i0 = extractelement <2 x i32> %arg4, i32 0 + %i1 = extractelement <2 x i32> %arg4, i32 1 + %i2 = extractelement <2 x i32> %arg7, i32 0 + %i3 = extractelement <2 x i32> %arg8, i32 0 %f0 = bitcast i32 %i0 to float %f1 = bitcast i32 %i1 to float %f2 = bitcast i32 %i2 to float %f3 = bitcast i32 %i3 to float - %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0 - %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1 - %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2 - %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3 - %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4 - ret {float, float, float, float, float} %r4 + %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0 + %r1 = insertvalue { float, float, float, 
float, float } %r0, float %f1, 1 + %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2 + %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3 + %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4 + ret { float, float, float, float, float } %r4 } - ; GCN: .long 165580 ; GCN-NEXT: .long 562 ; GCN-NEXT: .long 165584 @@ -162,38 +159,37 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x ; GCN: v_mov_b32_e32 v3, v4 ; GCN: v_mov_b32_e32 v4, v8 ; GCN-NOT: s_endpgm -attributes #3 = { "InitialPSInputAddr"="418" } -define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 { - %i0 = extractelement <2 x i32> %4, i32 0 - %i1 = extractelement <2 x i32> %4, i32 1 - %i2 = extractelement <2 x i32> %7, i32 0 - %i3 = extractelement <2 x i32> %8, i32 0 +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 { +bb: + %i0 = extractelement <2 x i32> %arg4, i32 0 + %i1 = extractelement <2 x i32> %arg4, i32 1 + %i2 = extractelement <2 x i32> %arg7, i32 0 + %i3 = extractelement <2 x i32> %arg8, i32 0 %f0 = bitcast i32 %i0 to float %f1 = bitcast i32 %i1 to float %f2 = bitcast i32 %i2 to float %f3 = bitcast i32 %i3 to float - %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0 - %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1 - %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2 - %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3 - %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4 - ret {float, float, float, float, float} %r4 + %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0 + %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1 + %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2 + %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3 + %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4 + ret { float, float, float, float, float } %r4 } - ; GCN-LABEL: {{^}}sgpr: ; GCN: s_add_i32 s0, s3, 2 ; GCN: s_mov_b32 s2, s3 ; GCN-NOT: s_endpgm -define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - %x = add i32 %2, 2 - %a = insertvalue {i32, i32, i32} undef, i32 %x, 0 - %b = insertvalue {i32, i32, i32} %a, i32 %1, 1 - %c = insertvalue {i32, i32, i32} %a, i32 %2, 2 - ret {i32, i32, i32} %c +define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + %x = add i32 %arg2, 2 + %a = insertvalue { i32, i32, i32 } undef, i32 %x, 0 + %b = insertvalue { i32, i32, i32 } %a, i32 %arg1, 1 + %c = insertvalue { i32, i32, i32 } %a, i32 %arg2, 2 + ret { i32, i32, i32 } %c } - ; GCN-LABEL: {{^}}sgpr_literal: ; GCN: s_mov_b32 s0, 5 ; GCN-NOT: s_mov_b32 s0, s0 @@ -201,37 +197,37 @@ define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* 
byval, i32 ; GCN-DAG: s_mov_b32 s2, 7 ; GCN-DAG: s_mov_b32 s3, 8 ; GCN-NOT: s_endpgm -define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - %x = add i32 %2, 2 - ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8} +define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + %x = add i32 %arg2, 2 + ret { i32, i32, i32, i32 } { i32 5, i32 6, i32 7, i32 8 } } - ; GCN-LABEL: {{^}}both: ; GCN: v_mov_b32_e32 v1, v0 -; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm +; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1 ; GCN-DAG: s_add_i32 s0, s3, 2 ; GCN-DAG: s_mov_b32 s1, s2 ; GCN: s_mov_b32 s2, s3 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) - %v = fadd float %3, 1.0 - %s = add i32 %2, 2 - %a0 = insertvalue {float, i32, float, i32, i32} undef, float %v, 0 - %a1 = insertvalue {float, i32, float, i32, i32} %a0, i32 %s, 1 - %a2 = insertvalue {float, i32, float, i32, i32} %a1, float %3, 2 - %a3 = insertvalue {float, i32, float, i32, i32} %a2, i32 %1, 3 - %a4 = insertvalue {float, i32, float, i32, i32} %a3, i32 %2, 4 - ret {float, i32, float, i32, i32} %a4 +define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 + %v = fadd float %arg3, 1.000000e+00 + %s = add i32 %arg2, 2 + %a0 = insertvalue { float, i32, float, i32, i32 } undef, float %v, 0 + %a1 = insertvalue { float, i32, float, i32, i32 } %a0, i32 %s, 1 + %a2 = insertvalue { float, i32, float, i32, i32 } %a1, float %arg3, 2 + %a3 = insertvalue { float, i32, float, i32, i32 } %a2, i32 %arg1, 3 + %a4 = insertvalue { float, i32, float, i32, i32 } %a3, i32 %arg2, 4 + ret { float, i32, float, i32, i32 } %a4 } - ; GCN-LABEL: {{^}}structure_literal: ; GCN: v_mov_b32_e32 v3, v0 -; GCN: exp mrt0 v3, v3, v3, v3 done compr vm +; GCN: exp mrt0 v3, v3, v3, v3 done vm ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: s_mov_b32 s0, 2 @@ -239,9 +235,16 @@ define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 ; GCN: s_waitcnt expcnt(0) -define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) - ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> }} +define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 + ret { { float, i32 }, { i32, <2 x float> } } { { float, i32 } { float 1.000000e+00, i32 2 }, { i32, <2 x float> } { i32 3, <2 x float> } } } -attributes #0 = { nounwind "InitialPSInputAddr"="0" } +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, 
i1, i1) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind "InitialPSInputAddr"="0" } +attributes #2 = { nounwind "InitialPSInputAddr"="1" } +attributes #3 = { nounwind "InitialPSInputAddr"="119" } +attributes #4 = { nounwind "InitialPSInputAddr"="418" } diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll index 51ca60492414..f2fbacbab82e 100644 --- a/test/CodeGen/AMDGPU/ret_jump.ll +++ b/test/CodeGen/AMDGPU/ret_jump.ll @@ -4,24 +4,86 @@ ; This should end with an no-op sequence of exec mask manipulations ; Mask should be in original state after executed unreachable block -; GCN-LABEL: {{^}}main: + +; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable: ; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: ; %else + ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc ; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] -; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] -; GCN: [[RET_BB]]: -; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]] +; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb +; GCN-NEXT: ; divergent unreachable -; GCN-NEXT: [[UNREACHABLE_BB]]: -; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]] -; GCN-NEXT: [[FINAL_BB]]: +; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow +; GCN-NEXT: s_or_b64 exec, exec + +; GCN-NEXT: [[RET_BB]]: +; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end0 -define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 { +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +entry: + %i.i = extractelement <2 x i32> %arg7, i32 0 + %j.i = extractelement <2 x i32> %arg7, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2 + %p87 = fmul float undef, %p2.i + %p88 = fadd float %p87, undef + %p93 = fadd float %p88, undef + %p97 = fmul float %p93, undef + %p102 = fsub float %p97, undef + %p104 = fmul float %p102, undef + %p106 = fadd float 0.000000e+00, %p104 + %p108 = fadd float undef, %p106 + %uniform.cond = icmp slt i32 %arg17, 0 + br i1 %uniform.cond, label %ret.bb, label %else + +else: ; preds = %main_body + %p124 = fmul float %p108, %p108 + %p125 = fsub float %p124, undef + %divergent.cond = fcmp olt float %p125, 0.000000e+00 + br i1 %divergent.cond, label %ret.bb, label %unreachable.bb + +unreachable.bb: ; 
preds = %else + unreachable + +ret.bb: ; preds = %else, %main_body + ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef +} + +; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable: +; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]] + +; GCN: ; BB#{{[0-9]+}}: ; %else +; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc +; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] +; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: ; %unreachable.bb +; GCN: ds_write_b32 +; GCN: s_waitcnt +; GCN: ; divergent unreachable + +; GCN: ; %ret.bb +; GCN: store_dword + +; GCN: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: ; return +; GCN-NEXT: .Lfunc_end +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { main_body: - %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) - %p87 = fmul float undef, %p83 + %i.i = extractelement <2 x i32> %arg7, i32 0 + %j.i = extractelement <2 x i32> %arg7, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2 + %p87 = fmul float undef, %p2.i %p88 = fadd float %p87, undef %p93 = fadd float %p88, undef %p97 = fmul float %p93, undef @@ -29,26 +91,35 @@ main_body: %p104 = fmul float %p102, undef %p106 = fadd float 0.000000e+00, %p104 %p108 = fadd float undef, %p106 - br i1 undef, label %ENDIF69, label %ELSE + %uniform.cond = icmp slt i32 %arg18, 0 + br i1 %uniform.cond, label %ret.bb, label %else -ELSE: ; preds = %main_body +else: ; preds = %main_body %p124 = fmul float %p108, %p108 %p125 = fsub float %p124, undef - %p126 = fcmp olt float %p125, 0.000000e+00 - br i1 %p126, label %ENDIF69, label %ELSE41 + %divergent.cond = fcmp olt float %p125, 0.000000e+00 + br i1 %divergent.cond, label %ret.bb, label %unreachable.bb -ELSE41: ; preds = %ELSE +unreachable.bb: ; preds = %else + store volatile i32 8, i32 addrspace(3)* undef unreachable -ENDIF69: ; preds = %ELSE, %main_body +ret.bb: ; preds = %else, %main_body + store volatile i32 11, i32 addrspace(1)* undef ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef } ; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 ; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 + +; 
Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 ; Function Attrs: nounwind readnone declare float @llvm.fabs.f32(float) #1 @@ -61,3 +132,4 @@ declare float @llvm.floor.f32(float) #1 attributes #0 = { "InitialPSInputAddr"="36983" } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind } diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll index b60c470de97c..266490718dd1 100644 --- a/test/CodeGen/AMDGPU/rotl.i64.ll +++ b/test/CodeGen/AMDGPU/rotl.i64.ll @@ -7,7 +7,7 @@ ; BOTH-DAG: s_lshr_b64 ; BOTH: s_or_b64 ; BOTH: s_endpgm -define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +define amdgpu_kernel void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { entry: %0 = shl i64 %x, %y %1 = sub i64 64, %y @@ -26,7 +26,7 @@ entry: ; BOTH: v_or_b32 ; BOTH: v_or_b32 ; BOTH: s_endpgm -define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +define amdgpu_kernel void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { entry: %x = load i64, i64 addrspace(1)* %xptr, align 8 %y = load i64, i64 addrspace(1)* %yptr, align 8 diff --git a/test/CodeGen/AMDGPU/rotl.ll b/test/CodeGen/AMDGPU/rotl.ll index 7d2b5538ca33..c4bc8cdaabf5 100644 --- a/test/CodeGen/AMDGPU/rotl.ll +++ b/test/CodeGen/AMDGPU/rotl.ll @@ -10,7 +10,7 @@ ; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}} ; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] ; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] -define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +define amdgpu_kernel void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { entry: %0 = shl i32 %x, %y %1 = sub i32 32, %y @@ -26,7 +26,7 @@ entry: ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_alignbit_b32 ; SI: s_endpgm -define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { entry: %0 = shl <2 x i32> %x, %y %1 = sub <2 x i32> , %y @@ -46,7 +46,7 @@ entry: ; SI-DAG: s_sub_i32 ; SI-DAG: v_alignbit_b32 ; SI: s_endpgm -define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { entry: %0 = shl <4 x i32> %x, %y %1 = sub <4 x i32> , %y diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll index 58a1efe08079..9eda479cd25c 100644 --- a/test/CodeGen/AMDGPU/rotr.i64.ll +++ b/test/CodeGen/AMDGPU/rotr.i64.ll @@ -6,7 +6,7 @@ ; BOTH-DAG: s_lshr_b64 ; BOTH-DAG: s_lshl_b64 ; BOTH: s_or_b64 -define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +define amdgpu_kernel void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { entry: %tmp0 = sub i64 64, %y %tmp1 = shl i64 %x, %tmp0 @@ -24,7 +24,7 @@ entry: ; VI-DAG: v_lshlrev_b64 ; BOTH: v_or_b32 ; BOTH: v_or_b32 -define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +define amdgpu_kernel void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { entry: %x = load i64, i64 addrspace(1)* %xptr, align 8 %y = load i64, i64 addrspace(1)* %yptr, align 8 @@ -37,7 +37,7 @@ entry: } ; BOTH-LABEL: {{^}}s_rotr_v2i64: -define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { +define amdgpu_kernel void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { entry: %tmp0 = sub <2 x i64> , %y 
%tmp1 = shl <2 x i64> %x, %tmp0 @@ -48,7 +48,7 @@ entry: } ; BOTH-LABEL: {{^}}v_rotr_v2i64: -define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { +define amdgpu_kernel void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { entry: %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 diff --git a/test/CodeGen/AMDGPU/rotr.ll b/test/CodeGen/AMDGPU/rotr.ll index 55d180077cc7..b4e2c2b67ce1 100644 --- a/test/CodeGen/AMDGPU/rotr.ll +++ b/test/CodeGen/AMDGPU/rotr.ll @@ -6,7 +6,7 @@ ; R600: BIT_ALIGN_INT ; SI: v_alignbit_b32 -define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +define amdgpu_kernel void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { entry: %tmp0 = sub i32 32, %y %tmp1 = shl i32 %x, %tmp0 @@ -22,7 +22,7 @@ entry: ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 -define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { entry: %tmp0 = sub <2 x i32> , %y %tmp1 = shl <2 x i32> %x, %tmp0 @@ -42,7 +42,7 @@ entry: ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 -define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { entry: %tmp0 = sub <4 x i32> , %y %tmp1 = shl <4 x i32> %x, %tmp0 diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll index 699440c3efbf..9462683efe0e 100644 --- a/test/CodeGen/AMDGPU/rsq.ll +++ b/test/CodeGen/AMDGPU/rsq.ll @@ -8,7 +8,7 @@ declare double @llvm.sqrt.f64(double) nounwind readnone ; SI-LABEL: {{^}}rsq_f32: ; SI: v_rsq_f32_e32 ; SI: s_endpgm -define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone %div = fdiv float 1.0, %sqrt @@ -20,7 +20,7 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali ; SI-UNSAFE: v_rsq_f64_e32 ; SI-SAFE: v_sqrt_f64_e32 ; SI: s_endpgm -define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone %div = fdiv double 1.0, %sqrt @@ -31,7 +31,7 @@ define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noa ; SI-LABEL: {{^}}rsq_f32_sgpr: ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} ; SI: s_endpgm -define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { +define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone %div = fdiv float 1.0, %sqrt store float %div, float addrspace(1)* %out, align 4 @@ -55,7 +55,7 @@ define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind ; SI-SAFE-NOT: v_rsq_f32 ; SI: s_endpgm -define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* 
%out, float addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -81,7 +81,7 @@ define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { ; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}} ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] -define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) %div = fdiv float -1.0, %sqrt @@ -96,7 +96,7 @@ define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* n ; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}} ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]] ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]] -define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) %div = fdiv double -1.0, %sqrt @@ -112,7 +112,7 @@ define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* ; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}} ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] -define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %val.fneg = fsub float -0.0, %val %sqrt = call float @llvm.sqrt.f32(float %val.fneg) @@ -128,7 +128,7 @@ define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1 ; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}} ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]] ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]] -define void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %val.fneg = fsub double -0.0, %val %sqrt = call double @llvm.sqrt.f64(double %val.fneg) diff --git a/test/CodeGen/AMDGPU/runtime-metadata.ll b/test/CodeGen/AMDGPU/runtime-metadata.ll deleted file mode 100644 index abdbc325fd4d..000000000000 --- a/test/CodeGen/AMDGPU/runtime-metadata.ll +++ /dev/null @@ -1,396 +0,0 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -amdgpu-dump-rtmd -amdgpu-check-rtmd-parser %s -o - 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=PARSER %s - -%struct.A = type { i8, float } -%opencl.image1d_t = type opaque -%opencl.image2d_t = type opaque -%opencl.image3d_t = type opaque -%opencl.queue_t = type opaque -%opencl.pipe_t = type opaque -%struct.B = type { i32 addrspace(1)*} -%opencl.clk_event_t = 
type opaque - -; CHECK: --- -; CHECK-NEXT: { amd.MDVersion: [ 2, 0 ], amd.PrintfInfo: [ '1:1:4:%d\n', '2:1:8:%g\n' ], amd.Kernels: - -; CHECK-NEXT: - { amd.KernelName: test_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 1, amd.ArgAlign: 1, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_char(i8 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_ushort2, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 4, amd.ArgTypeName: ushort2, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_ushort2(<2 x i16> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_int3, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_int3(<3 x i32> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_ulong4, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 32, amd.ArgAlign: 32, amd.ArgKind: 0, amd.ArgValueType: 10, amd.ArgTypeName: ulong4, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_ulong4(<4 x i64> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_half8, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: 
- { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 5, amd.ArgTypeName: half8, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_half8(<8 x half> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_float16, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 64, amd.ArgAlign: 64, amd.ArgKind: 0, amd.ArgValueType: 8, amd.ArgTypeName: float16, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_float16(<16 x float> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_double16, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 128, amd.ArgAlign: 128, amd.ArgKind: 0, amd.ArgValueType: 11, amd.ArgTypeName: double16, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_double16(<16 x double> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_pointer, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_image, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image2d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 
8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17 !kernel_arg_base_type !17 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_sampler, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 3, amd.ArgValueType: 6, amd.ArgTypeName: sampler_t, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_sampler(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_queue, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 6, amd.ArgValueType: 0, amd.ArgTypeName: queue_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_struct, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: struct A, amd.ArgAddrQual: 0, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_struct(%struct.A* byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_i128, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 16, amd.ArgAlign: 8, amd.ArgKind: 0, amd.ArgValueType: 0, amd.ArgTypeName: i128, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { 
amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_i128(i128 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_multi_arg, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 3, amd.ArgTypeName: short2, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24 !kernel_arg_base_type !24 !kernel_arg_type_qual !25 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_addr_space, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 2, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 2, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g, i32 addrspace(2)* %c, i32 addrspace(3)* %l) !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !25 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_type_qual, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsVolatile: 1 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsConst: 1, amd.ArgIsRestrict: 1 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 5, amd.ArgValueType: 0, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsPipe: 1 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, 
amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a, i32 addrspace(1)* %b, %opencl.pipe_t addrspace(1)* %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !70 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_access_qual, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image1d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 1 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image2d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 2 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image3d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 3 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro, %opencl.image2d_t addrspace(1)* %wo, %opencl.image3d_t addrspace(1)* %rw) !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62 !kernel_arg_base_type !62 !kernel_arg_type_qual !25 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_half, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: half, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_vec_type_hint_half(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_float, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: float, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_vec_type_hint_float(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 { - ret void -} - -; 
CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_double, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: double, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_vec_type_hint_double(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: char, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_vec_type_hint_char(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_short, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: short, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_vec_type_hint_short(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_long, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: long, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_vec_type_hint_long(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 { - ret void -} - -; CHECK-NEXT: - { 
amd.KernelName: test_vec_type_hint_unknown, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: unknown, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_reqd_wgs_vec_type_hint, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.ReqdWorkGroupSize: [ 1, 2, 4 ], amd.VecTypeHint: int, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5 !reqd_work_group_size !6 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_wgs_hint_vec_type_hint, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.WorkGroupSizeHint: [ 8, 16, 32 ], amd.VecTypeHint: uint4, amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7 !work_group_size_hint !8 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_arg_ptr_to_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int **', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 * addrspace(1)* %a) !kernel_arg_addr_space !81 
!kernel_arg_access_qual !2 !kernel_arg_type !80 !kernel_arg_base_type !80 !kernel_arg_type_qual !4 { - ret void -} -; CHECK-NEXT: - { amd.KernelName: test_arg_struct_contains_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: struct B, amd.ArgAddrQual: 0, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B * byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82 !kernel_arg_base_type !82 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_arg_vector_of_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: 'global int* __attribute__((ext_vector_type(2)))', amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83 !kernel_arg_base_type !83 !kernel_arg_type_qual !4 { - ret void -} - - -; CHECK-NEXT: - { amd.KernelName: test_arg_unknown_builtin_type, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: clk_event_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } -define amdgpu_kernel void @test_arg_unknown_builtin_type(%opencl.clk_event_t addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84 !kernel_arg_base_type !84 !kernel_arg_type_qual !4 { - ret void -} - -; CHECK-NEXT: - { amd.KernelName: test_pointee_align, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 9, amd.ArgTypeName: 'long *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 1, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 2, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char2 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 
2, amd.ArgValueType: 1, amd.ArgTypeName: 'char3 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char4 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 8, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char8 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 16, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char16 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 } -; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } } -define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, i8 addrspace(3)* %b, <2 x i8> addrspace(3)* %c, <3 x i8> addrspace(3)* %d, <4 x i8> addrspace(3)* %e, <8 x i8> addrspace(3)* %f, <16 x i8> addrspace(3)* %g) !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93 !kernel_arg_base_type !93 !kernel_arg_type_qual !94 { - ret void -} - -; CHECK-NEXT:... - -; PARSER: AMDGPU runtime metadata parser test passes. - -!llvm.printf.fmts = !{!100, !101} - -!1 = !{i32 0} -!2 = !{!"none"} -!3 = !{!"int"} -!4 = !{!""} -!5 = !{i32 undef, i32 1} -!6 = !{i32 1, i32 2, i32 4} -!7 = !{<4 x i32> undef, i32 0} -!8 = !{i32 8, i32 16, i32 32} -!9 = !{!"char"} -!10 = !{!"ushort2"} -!11 = !{!"int3"} -!12 = !{!"ulong4"} -!13 = !{!"half8"} -!14 = !{!"float16"} -!15 = !{!"double16"} -!16 = !{!"int *"} -!17 = !{!"image2d_t"} -!18 = !{!"sampler_t"} -!19 = !{!"queue_t"} -!20 = !{!"struct A"} -!21 = !{!"i128"} -!22 = !{i32 0, i32 0, i32 0} -!23 = !{!"none", !"none", !"none"} -!24 = !{!"int", !"short2", !"char3"} -!25 = !{!"", !"", !""} -!26 = !{half undef, i32 1} -!27 = !{float undef, i32 1} -!28 = !{double undef, i32 1} -!29 = !{i8 undef, i32 1} -!30 = !{i16 undef, i32 1} -!31 = !{i64 undef, i32 1} -!32 = !{i32 *undef, i32 1} -!50 = !{i32 1, i32 2, i32 3} -!51 = !{!"int *", !"int *", !"int *"} -!60 = !{i32 1, i32 1, i32 1} -!61 = !{!"read_only", !"write_only", !"read_write"} -!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"} -!70 = !{!"volatile", !"const restrict", !"pipe"} -!80 = !{!"int **"} -!81 = !{i32 1} -!82 = !{!"struct B"} -!83 = !{!"global int* __attribute__((ext_vector_type(2)))"} -!84 = !{!"clk_event_t"} -!opencl.ocl.version = !{!90} -!90 = !{i32 2, i32 0} -!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3} -!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"} -!93 = !{!"long *", !"char *", !"char2 *", !"char3 *", !"char4 *", !"char8 *", !"char16 *"} -!94 = !{!"", !"", !"", !"", !"", !"", !""} -!100 = !{!"1:1:4:%d\5Cn"} -!101 = !{!"2:1:8:%g\5Cn"} diff --git a/test/CodeGen/AMDGPU/s_addk_i32.ll b/test/CodeGen/AMDGPU/s_addk_i32.ll index f776faca8397..deef24cea377 100644 --- a/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -7,7 +7,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 65 store i32 %add, i32 
addrspace(1)* %out ret void @@ -19,7 +19,7 @@ define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] ; SI: s_endpgm -define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { %add0 = add i32 %a, 65 %add1 = add i32 %b, 65 store i32 %add0, i32 addrspace(1)* %out0 @@ -30,26 +30,35 @@ define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, ; SI-LABEL: {{^}}s_addk_i32_k1: ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 32767 ; (1 << 15) - 1 store i32 %add, i32 addrspace(1)* %out ret void } ; SI-LABEL: {{^}}s_addk_i32_k2: -; SI: s_addk_i32 {{s[0-9]+}}, 0xffef{{$}} +; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17 ; SI: s_endpgm -define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, -17 store i32 %add, i32 addrspace(1)* %out ret void } +; SI-LABEL: {{^}}s_addk_i32_k3: +; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}} +; SI: s_endpgm +define amdgpu_kernel void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) { + %add = add i32 %b, -65 + store i32 %add, i32 addrspace(1)* %out + ret void +} + ; SI-LABEL: {{^}}s_addk_v2i32_k0: ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI: s_endpgm -define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { +define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { %add = add <2 x i32> %b, store <2 x i32> %add, <2 x i32> addrspace(1)* %out ret void @@ -61,7 +70,7 @@ define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44 ; SI: s_endpgm -define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { +define amdgpu_kernel void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { %add = add <4 x i32> %b, store <4 x i32> %add, <4 x i32> addrspace(1)* %out ret void @@ -77,7 +86,7 @@ define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48 ; SI: s_endpgm -define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) { +define amdgpu_kernel void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) { %add = add <8 x i32> %b, store <8 x i32> %add, <8 x i32> addrspace(1)* %out ret void @@ -86,7 +95,7 @@ define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) { ; SI-LABEL: {{^}}no_s_addk_i32_k0: ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}} ; SI: s_endpgm -define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 32768 ; 1 << 15 store i32 %add, i32 addrspace(1)* %out ret void @@ -96,7 +105,7 @@ define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}commute_s_addk_i32: ; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 { +define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 { %size = call i32 
@llvm.amdgcn.groupstaticsize() %add = add i32 %size, %b call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add) diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll index 0164c45083a2..a131aaa3dfb4 100644 --- a/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -7,7 +7,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -21,7 +21,7 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -35,7 +35,7 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) store i64 %or, i64 addrspace(1)* %out @@ -49,7 +49,7 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -63,7 +63,7 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -78,7 +78,7 @@ define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define 
amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff store i64 %or, i64 addrspace(1)* %out @@ -92,7 +92,7 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) store i64 %or, i64 addrspace(1)* %out @@ -107,7 +107,7 @@ define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) store i64 %or, i64 addrspace(1)* %out @@ -122,7 +122,7 @@ define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 store i64 %or, i64 addrspace(1)* %out @@ -137,7 +137,7 @@ define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 store i64 %or, i64 addrspace(1)* %out @@ -152,7 +152,7 @@ define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 store i64 %or, i64 addrspace(1)* %out @@ -167,7 +167,7 @@ define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 
addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff store i64 %or, i64 addrspace(1)* %out @@ -182,7 +182,7 @@ define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 store i64 %or, i64 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/s_mulk_i32.ll b/test/CodeGen/AMDGPU/s_mulk_i32.ll index e83b368cc1cb..f6ed5408ba45 100644 --- a/test/CodeGen/AMDGPU/s_mulk_i32.ll +++ b/test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -7,7 +7,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 65 store i32 %mul, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}s_mulk_i32_k1: ; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 32767 ; (1 << 15) - 1 store i32 %mul, i32 addrspace(1)* %out ret void @@ -25,7 +25,7 @@ define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}s_mulk_i32_k2: ; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}} ; SI: s_endpgm -define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, -17 store i32 %mul, i32 addrspace(1)* %out ret void @@ -34,7 +34,7 @@ define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}no_s_mulk_i32_k0: ; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}} ; SI: s_endpgm -define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 32769 ; 1 << 15 + 1 store i32 %mul, i32 addrspace(1)* %out ret void @@ -44,7 +44,7 @@ define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}commute_s_mulk_i32: ; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 { +define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %add = mul i32 %size, %b call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add) diff --git a/test/CodeGen/AMDGPU/sad.ll b/test/CodeGen/AMDGPU/sad.ll index 534483401638..f7a1c65881d0 100644 --- a/test/CodeGen/AMDGPU/sad.ll +++ b/test/CodeGen/AMDGPU/sad.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, 
i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -18,7 +18,7 @@ define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: {{^}}v_sad_u32_constant_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20 -define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { %icmp0 = icmp ugt i32 %a, 90 %t0 = select i1 %icmp0, i32 %a, i32 90 @@ -34,7 +34,7 @@ define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { ; GCN-LABEL: {{^}}v_sad_u32_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -51,7 +51,7 @@ define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { ; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -68,7 +68,7 @@ define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -84,7 +84,7 @@ define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b store volatile i32 %t0, i32 *undef @@ -101,7 +101,7 @@ define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -119,7 +119,7 @@ define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b store volatile i32 %sub0, i32 *undef @@ -136,7 +136,7 @@ define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, 
i32 %b ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -154,7 +154,7 @@ define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { %icmp0 = icmp ugt <4 x i32> %a, %b %t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b @@ -173,7 +173,7 @@ define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, < ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { %icmp0 = icmp ugt <4 x i32> %a, %b %sub0 = sub <4 x i32> %a, %b %sub1 = sub <4 x i32> %b, %a @@ -187,7 +187,7 @@ define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, < ; GCN-LABEL: {{^}}v_sad_u32_i16_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { %icmp0 = icmp ugt i16 %a, %b %t0 = select i1 %icmp0, i16 %a, i16 %b @@ -204,7 +204,7 @@ define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) ; GCN-LABEL: {{^}}v_sad_u32_i16_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) { +define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) { %icmp0 = icmp ugt i16 %a, %b %sub0 = sub i16 %a, %b %sub1 = sub i16 %b, %a @@ -218,7 +218,7 @@ define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zero ; GCN-LABEL: {{^}}v_sad_u32_i8_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { %icmp0 = icmp ugt i8 %a, %b %t0 = select i1 %icmp0, i8 %a, i8 %b @@ -234,7 +234,7 @@ define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { ; GCN-LABEL: {{^}}v_sad_u32_i8_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { +define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { %icmp0 = icmp 
ugt i8 %a, %b %sub0 = sub i8 %a, %b %sub1 = sub i8 %b, %a @@ -251,7 +251,7 @@ define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext ; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -269,7 +269,7 @@ define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %d %sub1 = sub i32 %b, %a diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll index f8ced7942a60..586a455b2b91 100644 --- a/test/CodeGen/AMDGPU/saddo.ll +++ b/test/CodeGen/AMDGPU/saddo.ll @@ -6,7 +6,7 @@ declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone ; FUNC-LABEL: {{^}}saddo_i64_zext: -define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = extractvalue { i64, i1 } %sadd, 1 @@ -17,7 +17,7 @@ define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { } ; FUNC-LABEL: {{^}}s_saddo_i32: -define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %sadd, 0 %carry = extractvalue { i32, i1 } %sadd, 1 @@ -27,7 +27,7 @@ define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 } ; FUNC-LABEL: {{^}}v_saddo_i32: -define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -39,7 +39,7 @@ define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 } ; FUNC-LABEL: {{^}}s_saddo_i64: -define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = 
extractvalue { i64, i1 } %sadd, 1 @@ -51,7 +51,7 @@ define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 ; FUNC-LABEL: {{^}}v_saddo_i64: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll index 37083fbbd3c5..6e1dd1638333 100644 --- a/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -24,7 +24,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() #0 ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}} ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}} -define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = call i32 @llvm.amdgcn.workitem.id.y() @@ -55,17 +55,17 @@ done: ; preds = %loop ; GCN-LABEL: {{^}}smrd_valu: ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0 +; SI: s_mov_b32 ; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} ; SI: s_nop 3 ; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]] -; SI: s_mov_b32 ; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]] ; GCN-NOHSA: buffer_store_dword [[V_OUT]] ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]] -define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { entry: %tmp = icmp ne i32 %a, 0 br i1 %tmp, label %if, label %else @@ -93,7 +93,7 @@ endif: ; preds = %else, %if ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -113,7 +113,7 @@ entry: ; GCN-NOHSA: buffer_store_dword ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} -define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp @@ -133,7 +133,7 @@ entry: ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: buffer_store_dwordx2 ; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 
addrspace(2)* %in, i64 %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp @@ -155,7 +155,7 @@ entry: ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp @@ -189,7 +189,7 @@ entry: ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp @@ -230,7 +230,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN: s_endpgm -define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp @@ -247,7 +247,7 @@ entry: ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]] ; GCN-NOHSA: buffer_store_dword [[ADD]] ; GCN-HSA: flat_store_dword {{.*}}, [[ADD]] -define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { +define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -261,7 +261,7 @@ entry: ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset: ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}} ; GCN-HSA flat_load_dword v{{[0-9]}}, v{{[0-9]+:[0-9]+}} -define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -275,7 +275,7 @@ entry: ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -290,7 +290,7 @@ entry: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 
@llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -313,7 +313,7 @@ entry: ; GCN-NOHSA: buffer_store_dword ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -350,7 +350,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -385,7 +385,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -439,9 +439,9 @@ entry: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN-NOHSA: buffer_store_dword [[ONE]] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] -; GCN; {{^}}[[EXIT]]: +; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm -define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { bb3: ; preds = %bb2 %tmp0 = bitcast i32 %cond to float %tmp1 = fadd float %tmp0, 2.500000e-01 @@ -459,7 +459,7 @@ bb7: ; preds = %bb3 ; GCN-LABEL: {{^}}phi_visit_order: ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}} -define void @phi_visit_order() { +define amdgpu_kernel void @phi_visit_order() { bb: br label %bb1 @@ -484,7 +484,7 @@ bb4: ; GCN: [[LOOP_LABEL:[0-9a-zA-Z_]+]]: ; GCN: s_xor_b32 [[B]], [[B]], [[A]] ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]] -define void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { +define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { entry: br label %loop diff --git a/test/CodeGen/AMDGPU/sampler-resource-id.ll b/test/CodeGen/AMDGPU/sampler-resource-id.ll index c41d345369bf..4ea503bf6098 100644 --- a/test/CodeGen/AMDGPU/sampler-resource-id.ll +++ b/test/CodeGen/AMDGPU/sampler-resource-id.ll @@ -5,7 +5,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_0(i32 %in0, i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_0(i32 %in0, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in0) #0 store i32 %0, i32 addrspace(1)* %out @@ -17,7 +17,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in1) #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; 
EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in2) #0 store i32 %0, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir b/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir index af71086e542f..5bee36d878eb 100644 --- a/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir +++ b/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir @@ -1,23 +1,23 @@ # RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s --- | - define void @basic_insert_dcache_wb() { + define amdgpu_kernel void @basic_insert_dcache_wb() { ret void } - define void @explicit_flush_after() { + define amdgpu_kernel void @explicit_flush_after() { ret void } - define void @explicit_flush_before() { + define amdgpu_kernel void @explicit_flush_before() { ret void } - define void @no_scalar_store() { + define amdgpu_kernel void @no_scalar_store() { ret void } - define void @multi_block_store() { + define amdgpu_kernel void @multi_block_store() { bb0: br i1 undef, label %bb1, label %bb2 @@ -28,7 +28,7 @@ ret void } - define void @one_block_store() { + define amdgpu_kernel void @one_block_store() { bb0: br i1 undef, label %bb1, label %bb2 @@ -169,5 +169,5 @@ tracksRegLiveness: false body: | bb.0: S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 - SI_RETURN undef %vgpr0 + SI_RETURN_TO_EPILOG undef %vgpr0 ... diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll index 32df16778a91..62d0d9367885 100644 --- a/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,15 +1,15 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; XXX - Why the packing? 
-; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]] -; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]] -; SI: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]] -; SI: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]] -; SI: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}} -define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +; GCN-LABEL: {{^}}scalar_to_vector_v2i32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]] +; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]] +; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]] +; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]] +; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}} +define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tmp1 = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -17,11 +17,11 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace( ret void } -; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_dwordx2 -define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +; GCN-LABEL: {{^}}scalar_to_vector_v2f32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] +; GCN: buffer_store_dwordx2 +define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tmp1 = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -33,7 +33,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; to produce one, but for some reason never made it to selection. 
-; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 ; %bc = bitcast i32 %tmp1 to <4 x i8> @@ -42,7 +42,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 ; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 ; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> @@ -51,7 +51,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 ; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> ; %add = add <8 x i16> %bc, @@ -59,7 +59,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 ; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> ; %add = add <4 x i16> %bc, @@ -67,10 +67,9 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> -; %add = add <4 x i16> %bc, -; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 -; ret void -; } +define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { + %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 + %bc = bitcast <4 x i8> %newvec0 to <2 x half> + store <2 x half> %bc, <2 x half> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll index e040639a2d94..60abd83546d3 100644 --- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll +++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll @@ -1,81 +1,85 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts +; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s +; REQUIRES: asserts -define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = fcmp ult float %1, 0.000000e+00 - %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 - %6 = fsub float -0.000000e+00, %5 - %7 = fptosi float %6 to i32 - %8 = bitcast i32 %7 to float - %9 = fcmp ult float %0, 5.700000e+01 - %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00 - %11 = fsub float -0.000000e+00, %10 - %12 = fptosi float %11 
to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %8 to i32 - %15 = bitcast float %13 to i32 - %16 = and i32 %14, %15 - %17 = bitcast i32 %16 to float - %18 = bitcast float %17 to i32 - %19 = icmp ne i32 %18, 0 - %20 = fcmp ult float %0, 0.000000e+00 - %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00 - %22 = fsub float -0.000000e+00, %21 - %23 = fptosi float %22 to i32 - %24 = bitcast i32 %23 to float - %25 = bitcast float %24 to i32 - %26 = icmp ne i32 %25, 0 - br i1 %19, label %IF, label %ELSE + %tmp = extractelement <4 x float> %reg1, i32 0 + %tmp5 = extractelement <4 x float> %reg1, i32 1 + %tmp6 = extractelement <4 x float> %reg1, i32 2 + %tmp7 = extractelement <4 x float> %reg1, i32 3 + %tmp8 = fcmp ult float %tmp5, 0.000000e+00 + %tmp9 = select i1 %tmp8, float 1.000000e+00, float 0.000000e+00 + %tmp10 = fsub float -0.000000e+00, %tmp9 + %tmp11 = fptosi float %tmp10 to i32 + %tmp12 = bitcast i32 %tmp11 to float + %tmp13 = fcmp ult float %tmp, 5.700000e+01 + %tmp14 = select i1 %tmp13, float 1.000000e+00, float 0.000000e+00 + %tmp15 = fsub float -0.000000e+00, %tmp14 + %tmp16 = fptosi float %tmp15 to i32 + %tmp17 = bitcast i32 %tmp16 to float + %tmp18 = bitcast float %tmp12 to i32 + %tmp19 = bitcast float %tmp17 to i32 + %tmp20 = and i32 %tmp18, %tmp19 + %tmp21 = bitcast i32 %tmp20 to float + %tmp22 = bitcast float %tmp21 to i32 + %tmp23 = icmp ne i32 %tmp22, 0 + %tmp24 = fcmp ult float %tmp, 0.000000e+00 + %tmp25 = select i1 %tmp24, float 1.000000e+00, float 0.000000e+00 + %tmp26 = fsub float -0.000000e+00, %tmp25 + %tmp27 = fptosi float %tmp26 to i32 + %tmp28 = bitcast i32 %tmp27 to float + %tmp29 = bitcast float %tmp28 to i32 + %tmp30 = icmp ne i32 %tmp29, 0 + br i1 %tmp23, label %IF, label %ELSE IF: ; preds = %main_body - %. = select i1 %26, float 0.000000e+00, float 1.000000e+00 - %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00 + %. 
= select i1 %tmp30, float 0.000000e+00, float 1.000000e+00 + %.18 = select i1 %tmp30, float 1.000000e+00, float 0.000000e+00 br label %ENDIF ELSE: ; preds = %main_body - br i1 %26, label %ENDIF, label %ELSE17 + br i1 %tmp30, label %ENDIF, label %ELSE17 ENDIF: ; preds = %ELSE17, %ELSE, %IF - %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ] - %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ] - %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ] - %27 = call float @llvm.AMDGPU.clamp.f32(float %temp.0, float 0.000000e+00, float 1.000000e+00) - %28 = call float @llvm.AMDGPU.clamp.f32(float %temp1.0, float 0.000000e+00, float 1.000000e+00) - %29 = call float @llvm.AMDGPU.clamp.f32(float %temp2.0, float 0.000000e+00, float 1.000000e+00) - %30 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %31 = insertelement <4 x float> undef, float %27, i32 0 - %32 = insertelement <4 x float> %31, float %28, i32 1 - %33 = insertelement <4 x float> %32, float %29, i32 2 - %34 = insertelement <4 x float> %33, float %30, i32 3 - call void @llvm.r600.store.swizzle(<4 x float> %34, i32 0, i32 0) + %temp1.0 = phi float [ %., %IF ], [ %tmp48, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %tmp49, %ELSE17 ], [ 1.000000e+00, %ELSE ] + %temp.0 = phi float [ %.18, %IF ], [ %tmp47, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %max.0.i = call float @llvm.maxnum.f32(float %temp.0, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) + %max.0.i3 = call float @llvm.maxnum.f32(float %temp1.0, float 0.000000e+00) + %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00) + %max.0.i1 = call float @llvm.maxnum.f32(float %temp2.0, float 0.000000e+00) + %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) + %tmp31 = insertelement <4 x float> undef, float %clamp.i, i32 0 + %tmp32 = insertelement <4 x float> %tmp31, float %clamp.i4, i32 1 + %tmp33 = insertelement <4 x float> %tmp32, float %clamp.i2, i32 2 + %tmp34 = insertelement <4 x float> %tmp33, float 1.000000e+00, i32 3 + call void @llvm.r600.store.swizzle(<4 x float> %tmp34, i32 0, i32 0) ret void ELSE17: ; preds = %ELSE - %35 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %36 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %37 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %38 = fadd float %35, 0x3FC99999A0000000 - %39 = fadd float %36, 0x3FC99999A0000000 - %40 = fadd float %37, 0x3FC99999A0000000 - %41 = fadd float %38, 0x3FC99999A0000000 - %42 = fadd float %39, 0x3FC99999A0000000 - %43 = fadd float %40, 0x3FC99999A0000000 - %44 = fadd float %41, 0x3FC99999A0000000 - %45 = fadd float %42, 0x3FC99999A0000000 - %46 = fadd float %43, 0x3FC99999A0000000 - %47 = fadd float %44, 0x3FC99999A0000000 - %48 = fadd float %45, 0x3FC99999A0000000 - %49 = fadd float %46, 0x3FC99999A0000000 + %tmp35 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %tmp36 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %tmp37 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %tmp38 = fadd float %tmp35, 0x3FC99999A0000000 + %tmp39 = fadd float %tmp36, 0x3FC99999A0000000 + %tmp40 = fadd float %tmp37, 0x3FC99999A0000000 + %tmp41 = fadd float %tmp38, 0x3FC99999A0000000 + %tmp42 = fadd float %tmp39, 0x3FC99999A0000000 + %tmp43 = fadd float %tmp40, 0x3FC99999A0000000 + %tmp44 = fadd float %tmp41, 0x3FC99999A0000000 + %tmp45 = fadd float %tmp42, 0x3FC99999A0000000 + 
%tmp46 = fadd float %tmp43, 0x3FC99999A0000000 + %tmp47 = fadd float %tmp44, 0x3FC99999A0000000 + %tmp48 = fadd float %tmp45, 0x3FC99999A0000000 + %tmp49 = fadd float %tmp46, 0x3FC99999A0000000 br label %ENDIF } -declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll index f907e154f962..177957c0b35b 100644 --- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll +++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll @@ -1,88 +1,91 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts +; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s +; REQUIRES: asserts -define void @main() { +define amdgpu_kernel void @main() #0 { main_body: - %0 = load <4 x float>, <4 x float> addrspace(9)* null - %1 = extractelement <4 x float> %0, i32 3 - %2 = fptosi float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = bitcast float %3 to i32 - %5 = sdiv i32 %4, 4 - %6 = bitcast i32 %5 to float - %7 = bitcast float %6 to i32 - %8 = mul i32 %7, 4 - %9 = bitcast i32 %8 to float - %10 = bitcast float %9 to i32 - %11 = sub i32 0, %10 - %12 = bitcast i32 %11 to float - %13 = bitcast float %3 to i32 - %14 = bitcast float %12 to i32 - %15 = add i32 %13, %14 - %16 = bitcast i32 %15 to float - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = load <4 x float>, <4 x float> addrspace(9)* null - %20 = extractelement <4 x float> %19, i32 1 - %21 = load <4 x float>, <4 x float> addrspace(9)* null - %22 = extractelement <4 x float> %21, i32 2 + %tmp = load <4 x float>, <4 x float> addrspace(9)* null + %tmp5 = extractelement <4 x float> %tmp, i32 3 + %tmp6 = fptosi float %tmp5 to i32 + %tmp7 = bitcast i32 %tmp6 to float + %tmp8 = bitcast float %tmp7 to i32 + %tmp9 = sdiv i32 %tmp8, 4 + %tmp10 = bitcast i32 %tmp9 to float + %tmp11 = bitcast float %tmp10 to i32 + %tmp12 = mul i32 %tmp11, 4 + %tmp13 = bitcast i32 %tmp12 to float + %tmp14 = bitcast float %tmp13 to i32 + %tmp15 = sub i32 0, %tmp14 + %tmp16 = bitcast i32 %tmp15 to float + %tmp17 = bitcast float %tmp7 to i32 + %tmp18 = bitcast float %tmp16 to i32 + %tmp19 = add i32 %tmp17, %tmp18 + %tmp20 = bitcast i32 %tmp19 to float + %tmp21 = load <4 x float>, <4 x float> addrspace(9)* null + %tmp22 = extractelement <4 x float> %tmp21, i32 0 + %tmp23 = load <4 x float>, <4 x float> addrspace(9)* null + %tmp24 = extractelement <4 x float> %tmp23, i32 1 + %tmp25 = load <4 x float>, <4 x float> addrspace(9)* null + %tmp26 = extractelement <4 x float> %tmp25, i32 2 br label %LOOP LOOP: ; preds = %IF31, %main_body - %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ] - %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ] - %temp5.0 = phi float [ %20, %main_body ], [ %temp5.1, %IF31 ] - %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ] - %23 = bitcast float %temp12.0 to i32 - %24 = bitcast float %6 to i32 - %25 = icmp sge i32 %23, %24 - %26 = sext i1 %25 to i32 - %27 = bitcast i32 %26 to float - %28 = bitcast float %27 to i32 - %29 = icmp ne i32 %28, 0 - br i1 %29, label %IF, label %LOOP29 + %temp12.0 = phi float [ 0.000000e+00, 
%main_body ], [ %tmp47, %IF31 ] + %temp6.0 = phi float [ %tmp26, %main_body ], [ %temp6.1, %IF31 ] + %temp5.0 = phi float [ %tmp24, %main_body ], [ %temp5.1, %IF31 ] + %temp4.0 = phi float [ %tmp22, %main_body ], [ %temp4.1, %IF31 ] + %tmp27 = bitcast float %temp12.0 to i32 + %tmp28 = bitcast float %tmp10 to i32 + %tmp29 = icmp sge i32 %tmp27, %tmp28 + %tmp30 = sext i1 %tmp29 to i32 + %tmp31 = bitcast i32 %tmp30 to float + %tmp32 = bitcast float %tmp31 to i32 + %tmp33 = icmp ne i32 %tmp32, 0 + br i1 %tmp33, label %IF, label %LOOP29 IF: ; preds = %LOOP - %30 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00) - %31 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00) - %32 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %34 = insertelement <4 x float> undef, float %30, i32 0 - %35 = insertelement <4 x float> %34, float %31, i32 1 - %36 = insertelement <4 x float> %35, float %32, i32 2 - %37 = insertelement <4 x float> %36, float %33, i32 3 - call void @llvm.r600.store.swizzle(<4 x float> %37, i32 0, i32 0) + %max.0.i = call float @llvm.maxnum.f32(float %temp4.0, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) + %max.0.i3 = call float @llvm.maxnum.f32(float %temp5.0, float 0.000000e+00) + %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00) + %max.0.i1 = call float @llvm.maxnum.f32(float %temp6.0, float 0.000000e+00) + %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) + %tmp34 = insertelement <4 x float> undef, float %clamp.i, i32 0 + %tmp35 = insertelement <4 x float> %tmp34, float %clamp.i4, i32 1 + %tmp36 = insertelement <4 x float> %tmp35, float %clamp.i2, i32 2 + %tmp37 = insertelement <4 x float> %tmp36, float 1.000000e+00, i32 3 + call void @llvm.r600.store.swizzle(<4 x float> %tmp37, i32 0, i32 0) ret void -LOOP29: ; preds = %LOOP, %ENDIF30 +LOOP29: ; preds = %ENDIF30, %LOOP %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ] %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ] %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ] - %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ] - %38 = bitcast float %temp20.0 to i32 - %39 = bitcast float %16 to i32 - %40 = icmp sge i32 %38, %39 - %41 = sext i1 %40 to i32 - %42 = bitcast i32 %41 to float - %43 = bitcast float %42 to i32 - %44 = icmp ne i32 %43, 0 - br i1 %44, label %IF31, label %ENDIF30 + %temp20.0 = phi float [ %tmp50, %ENDIF30 ], [ 0.000000e+00, %LOOP ] + %tmp38 = bitcast float %temp20.0 to i32 + %tmp39 = bitcast float %tmp20 to i32 + %tmp40 = icmp sge i32 %tmp38, %tmp39 + %tmp41 = sext i1 %tmp40 to i32 + %tmp42 = bitcast i32 %tmp41 to float + %tmp43 = bitcast float %tmp42 to i32 + %tmp44 = icmp ne i32 %tmp43, 0 + br i1 %tmp44, label %IF31, label %ENDIF30 IF31: ; preds = %LOOP29 - %45 = bitcast float %temp12.0 to i32 - %46 = add i32 %45, 1 - %47 = bitcast i32 %46 to float + %tmp45 = bitcast float %temp12.0 to i32 + %tmp46 = add i32 %tmp45, 1 + %tmp47 = bitcast i32 %tmp46 to float br label %LOOP ENDIF30: ; preds = %LOOP29 - %48 = bitcast float %temp20.0 to i32 - %49 = add i32 %48, 1 - %50 = bitcast i32 %49 to float + %tmp48 = bitcast float %temp20.0 to i32 + %tmp49 = add i32 %tmp48, 1 + %tmp50 = bitcast i32 %tmp49 to float br label %LOOP29 } -declare float 
@llvm.AMDGPU.clamp.f32(float, float, float) #0 +declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 -declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/test/CodeGen/AMDGPU/schedule-fs-loop.ll index 5839785f00d5..6cd419f6cfc4 100644 --- a/test/CodeGen/AMDGPU/schedule-fs-loop.ll +++ b/test/CodeGen/AMDGPU/schedule-fs-loop.ll @@ -1,55 +1,84 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts +; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s +; REQUIRES: asserts -define void @main() { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { main_body: - %0 = load <4 x float>, <4 x float> addrspace(9)* null - %1 = extractelement <4 x float> %0, i32 3 - %2 = fptosi float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = load <4 x float>, <4 x float> addrspace(9)* null - %5 = extractelement <4 x float> %4, i32 0 - %6 = load <4 x float>, <4 x float> addrspace(9)* null - %7 = extractelement <4 x float> %6, i32 1 - %8 = load <4 x float>, <4 x float> addrspace(9)* null - %9 = extractelement <4 x float> %8, i32 2 - br label %LOOP + %tmp = extractelement <4 x float> %reg1, i32 0 + %tmp5 = extractelement <4 x float> %reg1, i32 1 + %tmp6 = extractelement <4 x float> %reg1, i32 2 + %tmp7 = extractelement <4 x float> %reg1, i32 3 + %tmp8 = fcmp ult float %tmp5, 0.000000e+00 + %tmp9 = select i1 %tmp8, float 1.000000e+00, float 0.000000e+00 + %tmp10 = fsub float -0.000000e+00, %tmp9 + %tmp11 = fptosi float %tmp10 to i32 + %tmp12 = bitcast i32 %tmp11 to float + %tmp13 = fcmp ult float %tmp, 5.700000e+01 + %tmp14 = select i1 %tmp13, float 1.000000e+00, float 0.000000e+00 + %tmp15 = fsub float -0.000000e+00, %tmp14 + %tmp16 = fptosi float %tmp15 to i32 + %tmp17 = bitcast i32 %tmp16 to float + %tmp18 = bitcast float %tmp12 to i32 + %tmp19 = bitcast float %tmp17 to i32 + %tmp20 = and i32 %tmp18, %tmp19 + %tmp21 = bitcast i32 %tmp20 to float + %tmp22 = bitcast float %tmp21 to i32 + %tmp23 = icmp ne i32 %tmp22, 0 + %tmp24 = fcmp ult float %tmp, 0.000000e+00 + %tmp25 = select i1 %tmp24, float 1.000000e+00, float 0.000000e+00 + %tmp26 = fsub float -0.000000e+00, %tmp25 + %tmp27 = fptosi float %tmp26 to i32 + %tmp28 = bitcast i32 %tmp27 to float + %tmp29 = bitcast float %tmp28 to i32 + %tmp30 = icmp ne i32 %tmp29, 0 + br i1 %tmp23, label %IF, label %ELSE -LOOP: ; preds = %ENDIF, %main_body - %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ] - %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ] - %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ] - %10 = bitcast float %temp8.0 to i32 - %11 = bitcast float %3 to i32 - %12 = icmp sge i32 %10, %11 - %13 = sext i1 %12 to i32 - %14 = bitcast i32 %13 to float - %15 = bitcast float %14 to i32 - %16 = icmp ne i32 %15, 0 - br i1 %16, label %IF, label %ENDIF +IF: ; preds = %main_body + %. 
= select i1 %tmp30, float 0.000000e+00, float 1.000000e+00 + %.18 = select i1 %tmp30, float 1.000000e+00, float 0.000000e+00 + br label %ENDIF -IF: ; preds = %LOOP - %17 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00) - %18 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00) - %19 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00) - %20 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %21 = insertelement <4 x float> undef, float %17, i32 0 - %22 = insertelement <4 x float> %21, float %18, i32 1 - %23 = insertelement <4 x float> %22, float %19, i32 2 - %24 = insertelement <4 x float> %23, float %20, i32 3 - call void @llvm.r600.store.swizzle(<4 x float> %24, i32 0, i32 0) +ELSE: ; preds = %main_body + br i1 %tmp30, label %ENDIF, label %ELSE17 + +ENDIF: ; preds = %ELSE17, %ELSE, %IF + %temp1.0 = phi float [ %., %IF ], [ %tmp48, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %tmp49, %ELSE17 ], [ 1.000000e+00, %ELSE ] + %temp.0 = phi float [ %.18, %IF ], [ %tmp47, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %max.0.i = call float @llvm.maxnum.f32(float %temp.0, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) + %max.0.i3 = call float @llvm.maxnum.f32(float %temp1.0, float 0.000000e+00) + %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00) + %max.0.i1 = call float @llvm.maxnum.f32(float %temp2.0, float 0.000000e+00) + %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) + %tmp31 = insertelement <4 x float> undef, float %clamp.i, i32 0 + %tmp32 = insertelement <4 x float> %tmp31, float %clamp.i4, i32 1 + %tmp33 = insertelement <4 x float> %tmp32, float %clamp.i2, i32 2 + %tmp34 = insertelement <4 x float> %tmp33, float 1.000000e+00, i32 3 + call void @llvm.r600.store.swizzle(<4 x float> %tmp34, i32 0, i32 0) ret void -ENDIF: ; preds = %LOOP - %25 = bitcast float %temp8.0 to i32 - %26 = add i32 %25, 1 - %27 = bitcast i32 %26 to float - br label %LOOP +ELSE17: ; preds = %ELSE + %tmp35 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %tmp36 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %tmp37 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %tmp38 = fadd float %tmp35, 0x3FC99999A0000000 + %tmp39 = fadd float %tmp36, 0x3FC99999A0000000 + %tmp40 = fadd float %tmp37, 0x3FC99999A0000000 + %tmp41 = fadd float %tmp38, 0x3FC99999A0000000 + %tmp42 = fadd float %tmp39, 0x3FC99999A0000000 + %tmp43 = fadd float %tmp40, 0x3FC99999A0000000 + %tmp44 = fadd float %tmp41, 0x3FC99999A0000000 + %tmp45 = fadd float %tmp42, 0x3FC99999A0000000 + %tmp46 = fadd float %tmp43, 0x3FC99999A0000000 + %tmp47 = fadd float %tmp44, 0x3FC99999A0000000 + %tmp48 = fadd float %tmp45, 0x3FC99999A0000000 + %tmp49 = fadd float %tmp46, 0x3FC99999A0000000 + br label %ENDIF } -declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0 - -declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 +declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0 -attributes #0 = { readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll index 32c456bd2ceb..44d46086f02a 100644 --- 
a/test/CodeGen/AMDGPU/schedule-global-loads.ll +++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll @@ -10,7 +10,7 @@ ; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; SI: buffer_store_dword [[REG0]] ; SI: buffer_store_dword [[REG1]] -define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { %load0 = load i32, i32 addrspace(1)* %ptr, align 4 %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2 %load1 = load i32, i32 addrspace(1)* %gep, align 4 @@ -24,7 +24,7 @@ define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* ; FUNC-LABEL: {{^}}same_base_ptr_crash: ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +define amdgpu_kernel void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { entry: %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset %tmp0 = load i32, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/schedule-if-2.ll b/test/CodeGen/AMDGPU/schedule-if-2.ll index aa67b2e0f7db..964298a55318 100644 --- a/test/CodeGen/AMDGPU/schedule-if-2.ll +++ b/test/CodeGen/AMDGPU/schedule-if-2.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs ;REQUIRES: asserts -define void @main() { +define amdgpu_kernel void @main() { main_body: %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) %1 = extractelement <4 x float> %0, i32 0 diff --git a/test/CodeGen/AMDGPU/schedule-if.ll b/test/CodeGen/AMDGPU/schedule-if.ll index 6637b3897717..feac5d918f63 100644 --- a/test/CodeGen/AMDGPU/schedule-if.ll +++ b/test/CodeGen/AMDGPU/schedule-if.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs ;REQUIRES: asserts -define void @main() { +define amdgpu_kernel void @main() { main_body: %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) %1 = extractelement <4 x float> %0, i32 0 diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll index ccfde7b9adc5..5c47c163dcce 100644 --- a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll +++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -12,7 +12,7 @@ ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 -define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { +define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { store i32 %x, i32 addrspace(1)* %out0, align 4 store i32 %y, i32 addrspace(1)* %out1, align 4 ret void @@ -26,7 +26,7 @@ define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, ; GCN: s_load_dwordx2 ; GCN: s_load_dwordx2 ; GCN: s_endpgm -define void @same_base_ptr_crash(i64 addrspace(1)* %out, +define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 
%arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll new file mode 100644 index 000000000000..4520fe86136f --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll @@ -0,0 +1,591 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s + +; We expect a two digit VGPR usage here, not a three digit. +; CHECK: NumVgprs: {{[0-9][0-9]$}} + +define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) { +bb: + %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1 + %tmp2 = load float, float addrspace(3)* %tmp, align 4 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2 + %tmp4 = load float, float addrspace(3)* %tmp3, align 4 + %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3 + %tmp6 = load float, float addrspace(3)* %tmp5, align 4 + %tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6) + %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5 + %tmp9 = load float, float addrspace(3)* %tmp8, align 4 + %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6 + %tmp11 = load float, float addrspace(3)* %tmp10, align 4 + %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7 + %tmp13 = load float, float addrspace(3)* %tmp12, align 4 + %tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13) + %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9 + %tmp16 = load float, float addrspace(3)* %tmp15, align 4 + %tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10 + %tmp18 = load float, float addrspace(3)* %tmp17, align 4 + %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11 + %tmp20 = load float, float addrspace(3)* %tmp19, align 4 + %tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20) + %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13 + %tmp23 = load float, float addrspace(3)* %tmp22, align 4 + %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14 + %tmp25 = load float, float addrspace(3)* %tmp24, align 4 + %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15 + %tmp27 = load float, float addrspace(3)* %tmp26, align 4 + %tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27) + %tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17 + %tmp30 = load float, float addrspace(3)* %tmp29, align 4 + %tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18 + %tmp32 = load float, float addrspace(3)* %tmp31, align 4 + %tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19 + %tmp34 = load float, float addrspace(3)* %tmp33, align 4 + %tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34) + %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21 + %tmp37 = load float, float addrspace(3)* %tmp36, align 4 + %tmp38 = 
getelementptr inbounds float, float addrspace(3)* %arg, i32 22 + %tmp39 = load float, float addrspace(3)* %tmp38, align 4 + %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23 + %tmp41 = load float, float addrspace(3)* %tmp40, align 4 + %tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41) + %tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25 + %tmp44 = load float, float addrspace(3)* %tmp43, align 4 + %tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26 + %tmp46 = load float, float addrspace(3)* %tmp45, align 4 + %tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27 + %tmp48 = load float, float addrspace(3)* %tmp47, align 4 + %tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48) + %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29 + %tmp51 = load float, float addrspace(3)* %tmp50, align 4 + %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30 + %tmp53 = load float, float addrspace(3)* %tmp52, align 4 + %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31 + %tmp55 = load float, float addrspace(3)* %tmp54, align 4 + %tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55) + %tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33 + %tmp58 = load float, float addrspace(3)* %tmp57, align 4 + %tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34 + %tmp60 = load float, float addrspace(3)* %tmp59, align 4 + %tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35 + %tmp62 = load float, float addrspace(3)* %tmp61, align 4 + %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62) + %tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37 + %tmp65 = load float, float addrspace(3)* %tmp64, align 4 + %tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38 + %tmp67 = load float, float addrspace(3)* %tmp66, align 4 + %tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39 + %tmp69 = load float, float addrspace(3)* %tmp68, align 4 + %tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69) + %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41 + %tmp72 = load float, float addrspace(3)* %tmp71, align 4 + %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42 + %tmp74 = load float, float addrspace(3)* %tmp73, align 4 + %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43 + %tmp76 = load float, float addrspace(3)* %tmp75, align 4 + %tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76) + %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45 + %tmp79 = load float, float addrspace(3)* %tmp78, align 4 + %tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46 + %tmp81 = load float, float addrspace(3)* %tmp80, align 4 + %tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47 + %tmp83 = load float, float addrspace(3)* %tmp82, align 4 + %tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83) + %tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49 + %tmp86 = load float, float addrspace(3)* %tmp85, align 4 + %tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50 + %tmp88 = load float, float addrspace(3)* %tmp87, align 4 + %tmp89 = getelementptr 
inbounds float, float addrspace(3)* %arg, i32 51 + %tmp90 = load float, float addrspace(3)* %tmp89, align 4 + %tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90) + %tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53 + %tmp93 = load float, float addrspace(3)* %tmp92, align 4 + %tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54 + %tmp95 = load float, float addrspace(3)* %tmp94, align 4 + %tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55 + %tmp97 = load float, float addrspace(3)* %tmp96, align 4 + %tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97) + %tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57 + %tmp100 = load float, float addrspace(3)* %tmp99, align 4 + %tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58 + %tmp102 = load float, float addrspace(3)* %tmp101, align 4 + %tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59 + %tmp104 = load float, float addrspace(3)* %tmp103, align 4 + %tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104) + %tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61 + %tmp107 = load float, float addrspace(3)* %tmp106, align 4 + %tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62 + %tmp109 = load float, float addrspace(3)* %tmp108, align 4 + %tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63 + %tmp111 = load float, float addrspace(3)* %tmp110, align 4 + %tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111) + %tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65 + %tmp114 = load float, float addrspace(3)* %tmp113, align 4 + %tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66 + %tmp116 = load float, float addrspace(3)* %tmp115, align 4 + %tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67 + %tmp118 = load float, float addrspace(3)* %tmp117, align 4 + %tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118) + %tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69 + %tmp121 = load float, float addrspace(3)* %tmp120, align 4 + %tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70 + %tmp123 = load float, float addrspace(3)* %tmp122, align 4 + %tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71 + %tmp125 = load float, float addrspace(3)* %tmp124, align 4 + %tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125) + %tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73 + %tmp128 = load float, float addrspace(3)* %tmp127, align 4 + %tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74 + %tmp130 = load float, float addrspace(3)* %tmp129, align 4 + %tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75 + %tmp132 = load float, float addrspace(3)* %tmp131, align 4 + %tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132) + %tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77 + %tmp135 = load float, float addrspace(3)* %tmp134, align 4 + %tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78 + %tmp137 = load float, float addrspace(3)* %tmp136, align 4 + %tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79 + %tmp139 = load float, float 
addrspace(3)* %tmp138, align 4 + %tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139) + %tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81 + %tmp142 = load float, float addrspace(3)* %tmp141, align 4 + %tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82 + %tmp144 = load float, float addrspace(3)* %tmp143, align 4 + %tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83 + %tmp146 = load float, float addrspace(3)* %tmp145, align 4 + %tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146) + %tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85 + %tmp149 = load float, float addrspace(3)* %tmp148, align 4 + %tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86 + %tmp151 = load float, float addrspace(3)* %tmp150, align 4 + %tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87 + %tmp153 = load float, float addrspace(3)* %tmp152, align 4 + %tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153) + %tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89 + %tmp156 = load float, float addrspace(3)* %tmp155, align 4 + %tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90 + %tmp158 = load float, float addrspace(3)* %tmp157, align 4 + %tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91 + %tmp160 = load float, float addrspace(3)* %tmp159, align 4 + %tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160) + %tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93 + %tmp163 = load float, float addrspace(3)* %tmp162, align 4 + %tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94 + %tmp165 = load float, float addrspace(3)* %tmp164, align 4 + %tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95 + %tmp167 = load float, float addrspace(3)* %tmp166, align 4 + %tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167) + %tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97 + %tmp170 = load float, float addrspace(3)* %tmp169, align 4 + %tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98 + %tmp172 = load float, float addrspace(3)* %tmp171, align 4 + %tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99 + %tmp174 = load float, float addrspace(3)* %tmp173, align 4 + %tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174) + %tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101 + %tmp177 = load float, float addrspace(3)* %tmp176, align 4 + %tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102 + %tmp179 = load float, float addrspace(3)* %tmp178, align 4 + %tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103 + %tmp181 = load float, float addrspace(3)* %tmp180, align 4 + %tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181) + %tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105 + %tmp184 = load float, float addrspace(3)* %tmp183, align 4 + %tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106 + %tmp186 = load float, float addrspace(3)* %tmp185, align 4 + %tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107 + %tmp188 = load float, float addrspace(3)* %tmp187, align 4 + %tmp189 = tail call 
float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188) + %tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109 + %tmp191 = load float, float addrspace(3)* %tmp190, align 4 + %tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110 + %tmp193 = load float, float addrspace(3)* %tmp192, align 4 + %tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111 + %tmp195 = load float, float addrspace(3)* %tmp194, align 4 + %tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195) + %tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113 + %tmp198 = load float, float addrspace(3)* %tmp197, align 4 + %tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114 + %tmp200 = load float, float addrspace(3)* %tmp199, align 4 + %tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115 + %tmp202 = load float, float addrspace(3)* %tmp201, align 4 + %tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202) + %tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117 + %tmp205 = load float, float addrspace(3)* %tmp204, align 4 + %tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118 + %tmp207 = load float, float addrspace(3)* %tmp206, align 4 + %tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119 + %tmp209 = load float, float addrspace(3)* %tmp208, align 4 + %tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209) + %tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121 + %tmp212 = load float, float addrspace(3)* %tmp211, align 4 + %tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122 + %tmp214 = load float, float addrspace(3)* %tmp213, align 4 + %tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123 + %tmp216 = load float, float addrspace(3)* %tmp215, align 4 + %tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216) + %tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125 + %tmp219 = load float, float addrspace(3)* %tmp218, align 4 + %tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126 + %tmp221 = load float, float addrspace(3)* %tmp220, align 4 + %tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127 + %tmp223 = load float, float addrspace(3)* %tmp222, align 4 + %tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223) + %tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129 + %tmp226 = load float, float addrspace(3)* %tmp225, align 4 + %tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130 + %tmp228 = load float, float addrspace(3)* %tmp227, align 4 + %tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131 + %tmp230 = load float, float addrspace(3)* %tmp229, align 4 + %tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230) + %tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133 + %tmp233 = load float, float addrspace(3)* %tmp232, align 4 + %tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134 + %tmp235 = load float, float addrspace(3)* %tmp234, align 4 + %tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135 + %tmp237 = load float, float addrspace(3)* %tmp236, align 4 + %tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, 
float %tmp235, float %tmp237) + %tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137 + %tmp240 = load float, float addrspace(3)* %tmp239, align 4 + %tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138 + %tmp242 = load float, float addrspace(3)* %tmp241, align 4 + %tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139 + %tmp244 = load float, float addrspace(3)* %tmp243, align 4 + %tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244) + %tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141 + %tmp247 = load float, float addrspace(3)* %tmp246, align 4 + %tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142 + %tmp249 = load float, float addrspace(3)* %tmp248, align 4 + %tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143 + %tmp251 = load float, float addrspace(3)* %tmp250, align 4 + %tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251) + %tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145 + %tmp254 = load float, float addrspace(3)* %tmp253, align 4 + %tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146 + %tmp256 = load float, float addrspace(3)* %tmp255, align 4 + %tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147 + %tmp258 = load float, float addrspace(3)* %tmp257, align 4 + %tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258) + %tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149 + %tmp261 = load float, float addrspace(3)* %tmp260, align 4 + %tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150 + %tmp263 = load float, float addrspace(3)* %tmp262, align 4 + %tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151 + %tmp265 = load float, float addrspace(3)* %tmp264, align 4 + %tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265) + %tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153 + %tmp268 = load float, float addrspace(3)* %tmp267, align 4 + %tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154 + %tmp270 = load float, float addrspace(3)* %tmp269, align 4 + %tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155 + %tmp272 = load float, float addrspace(3)* %tmp271, align 4 + %tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272) + %tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157 + %tmp275 = load float, float addrspace(3)* %tmp274, align 4 + %tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158 + %tmp277 = load float, float addrspace(3)* %tmp276, align 4 + %tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159 + %tmp279 = load float, float addrspace(3)* %tmp278, align 4 + %tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279) + %tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161 + %tmp282 = load float, float addrspace(3)* %tmp281, align 4 + %tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162 + %tmp284 = load float, float addrspace(3)* %tmp283, align 4 + %tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163 + %tmp286 = load float, float addrspace(3)* %tmp285, align 4 + %tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286) + %tmp288 
= getelementptr inbounds float, float addrspace(3)* %arg, i32 165 + %tmp289 = load float, float addrspace(3)* %tmp288, align 4 + %tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166 + %tmp291 = load float, float addrspace(3)* %tmp290, align 4 + %tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167 + %tmp293 = load float, float addrspace(3)* %tmp292, align 4 + %tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293) + %tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169 + %tmp296 = load float, float addrspace(3)* %tmp295, align 4 + %tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170 + %tmp298 = load float, float addrspace(3)* %tmp297, align 4 + %tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171 + %tmp300 = load float, float addrspace(3)* %tmp299, align 4 + %tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300) + %tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173 + %tmp303 = load float, float addrspace(3)* %tmp302, align 4 + %tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174 + %tmp305 = load float, float addrspace(3)* %tmp304, align 4 + %tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175 + %tmp307 = load float, float addrspace(3)* %tmp306, align 4 + %tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307) + %tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177 + %tmp310 = load float, float addrspace(3)* %tmp309, align 4 + %tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178 + %tmp312 = load float, float addrspace(3)* %tmp311, align 4 + %tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179 + %tmp314 = load float, float addrspace(3)* %tmp313, align 4 + %tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314) + %tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181 + %tmp317 = load float, float addrspace(3)* %tmp316, align 4 + %tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182 + %tmp319 = load float, float addrspace(3)* %tmp318, align 4 + %tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183 + %tmp321 = load float, float addrspace(3)* %tmp320, align 4 + %tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321) + %tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185 + %tmp324 = load float, float addrspace(3)* %tmp323, align 4 + %tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186 + %tmp326 = load float, float addrspace(3)* %tmp325, align 4 + %tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187 + %tmp328 = load float, float addrspace(3)* %tmp327, align 4 + %tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328) + %tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189 + %tmp331 = load float, float addrspace(3)* %tmp330, align 4 + %tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190 + %tmp333 = load float, float addrspace(3)* %tmp332, align 4 + %tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191 + %tmp335 = load float, float addrspace(3)* %tmp334, align 4 + %tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335) + %tmp337 = getelementptr inbounds float, float 
addrspace(3)* %arg, i32 193 + %tmp338 = load float, float addrspace(3)* %tmp337, align 4 + %tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194 + %tmp340 = load float, float addrspace(3)* %tmp339, align 4 + %tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195 + %tmp342 = load float, float addrspace(3)* %tmp341, align 4 + %tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342) + %tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197 + %tmp345 = load float, float addrspace(3)* %tmp344, align 4 + %tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198 + %tmp347 = load float, float addrspace(3)* %tmp346, align 4 + %tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199 + %tmp349 = load float, float addrspace(3)* %tmp348, align 4 + %tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349) + %tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201 + %tmp352 = load float, float addrspace(3)* %tmp351, align 4 + %tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202 + %tmp354 = load float, float addrspace(3)* %tmp353, align 4 + %tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203 + %tmp356 = load float, float addrspace(3)* %tmp355, align 4 + %tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356) + %tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205 + %tmp359 = load float, float addrspace(3)* %tmp358, align 4 + %tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206 + %tmp361 = load float, float addrspace(3)* %tmp360, align 4 + %tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207 + %tmp363 = load float, float addrspace(3)* %tmp362, align 4 + %tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363) + %tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209 + %tmp366 = load float, float addrspace(3)* %tmp365, align 4 + %tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210 + %tmp368 = load float, float addrspace(3)* %tmp367, align 4 + %tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211 + %tmp370 = load float, float addrspace(3)* %tmp369, align 4 + %tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370) + %tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213 + %tmp373 = load float, float addrspace(3)* %tmp372, align 4 + %tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214 + %tmp375 = load float, float addrspace(3)* %tmp374, align 4 + %tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215 + %tmp377 = load float, float addrspace(3)* %tmp376, align 4 + %tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377) + %tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217 + %tmp380 = load float, float addrspace(3)* %tmp379, align 4 + %tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218 + %tmp382 = load float, float addrspace(3)* %tmp381, align 4 + %tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219 + %tmp384 = load float, float addrspace(3)* %tmp383, align 4 + %tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384) + %tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221 + %tmp387 = 
load float, float addrspace(3)* %tmp386, align 4 + %tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222 + %tmp389 = load float, float addrspace(3)* %tmp388, align 4 + %tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223 + %tmp391 = load float, float addrspace(3)* %tmp390, align 4 + %tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391) + %tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225 + %tmp394 = load float, float addrspace(3)* %tmp393, align 4 + %tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226 + %tmp396 = load float, float addrspace(3)* %tmp395, align 4 + %tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227 + %tmp398 = load float, float addrspace(3)* %tmp397, align 4 + %tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398) + %tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229 + %tmp401 = load float, float addrspace(3)* %tmp400, align 4 + %tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230 + %tmp403 = load float, float addrspace(3)* %tmp402, align 4 + %tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231 + %tmp405 = load float, float addrspace(3)* %tmp404, align 4 + %tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405) + %tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233 + %tmp408 = load float, float addrspace(3)* %tmp407, align 4 + %tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234 + %tmp410 = load float, float addrspace(3)* %tmp409, align 4 + %tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235 + %tmp412 = load float, float addrspace(3)* %tmp411, align 4 + %tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412) + %tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237 + %tmp415 = load float, float addrspace(3)* %tmp414, align 4 + %tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238 + %tmp417 = load float, float addrspace(3)* %tmp416, align 4 + %tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239 + %tmp419 = load float, float addrspace(3)* %tmp418, align 4 + %tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419) + %tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241 + %tmp422 = load float, float addrspace(3)* %tmp421, align 4 + %tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242 + %tmp424 = load float, float addrspace(3)* %tmp423, align 4 + %tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243 + %tmp426 = load float, float addrspace(3)* %tmp425, align 4 + %tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426) + %tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245 + %tmp429 = load float, float addrspace(3)* %tmp428, align 4 + %tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246 + %tmp431 = load float, float addrspace(3)* %tmp430, align 4 + %tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247 + %tmp433 = load float, float addrspace(3)* %tmp432, align 4 + %tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433) + %tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249 + %tmp436 = load float, float addrspace(3)* %tmp435, 
align 4 + %tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250 + %tmp438 = load float, float addrspace(3)* %tmp437, align 4 + %tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251 + %tmp440 = load float, float addrspace(3)* %tmp439, align 4 + %tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440) + %tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253 + %tmp443 = load float, float addrspace(3)* %tmp442, align 4 + %tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254 + %tmp445 = load float, float addrspace(3)* %tmp444, align 4 + %tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255 + %tmp447 = load float, float addrspace(3)* %tmp446, align 4 + %tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447) + store float %tmp7, float addrspace(1)* %arg1, align 4 + %tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1 + store float %tmp14, float addrspace(1)* %tmp449, align 4 + %tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2 + store float %tmp21, float addrspace(1)* %tmp450, align 4 + %tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3 + store float %tmp28, float addrspace(1)* %tmp451, align 4 + %tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4 + store float %tmp35, float addrspace(1)* %tmp452, align 4 + %tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5 + store float %tmp42, float addrspace(1)* %tmp453, align 4 + %tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6 + store float %tmp49, float addrspace(1)* %tmp454, align 4 + %tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7 + store float %tmp56, float addrspace(1)* %tmp455, align 4 + %tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8 + store float %tmp63, float addrspace(1)* %tmp456, align 4 + %tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9 + store float %tmp70, float addrspace(1)* %tmp457, align 4 + %tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10 + store float %tmp77, float addrspace(1)* %tmp458, align 4 + %tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11 + store float %tmp84, float addrspace(1)* %tmp459, align 4 + %tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12 + store float %tmp91, float addrspace(1)* %tmp460, align 4 + %tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13 + store float %tmp98, float addrspace(1)* %tmp461, align 4 + %tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14 + store float %tmp105, float addrspace(1)* %tmp462, align 4 + %tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15 + store float %tmp112, float addrspace(1)* %tmp463, align 4 + %tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16 + store float %tmp119, float addrspace(1)* %tmp464, align 4 + %tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17 + store float %tmp126, float addrspace(1)* %tmp465, align 4 + %tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18 + store float %tmp133, float addrspace(1)* %tmp466, align 4 + %tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19 + store float %tmp140, float addrspace(1)* %tmp467, align 4 + %tmp468 = getelementptr inbounds float, float addrspace(1)* 
%arg1, i64 20 + store float %tmp147, float addrspace(1)* %tmp468, align 4 + %tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21 + store float %tmp154, float addrspace(1)* %tmp469, align 4 + %tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22 + store float %tmp161, float addrspace(1)* %tmp470, align 4 + %tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23 + store float %tmp168, float addrspace(1)* %tmp471, align 4 + %tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24 + store float %tmp175, float addrspace(1)* %tmp472, align 4 + %tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25 + store float %tmp182, float addrspace(1)* %tmp473, align 4 + %tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26 + store float %tmp189, float addrspace(1)* %tmp474, align 4 + %tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27 + store float %tmp196, float addrspace(1)* %tmp475, align 4 + %tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28 + store float %tmp203, float addrspace(1)* %tmp476, align 4 + %tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29 + store float %tmp210, float addrspace(1)* %tmp477, align 4 + %tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30 + store float %tmp217, float addrspace(1)* %tmp478, align 4 + %tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31 + store float %tmp224, float addrspace(1)* %tmp479, align 4 + %tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32 + store float %tmp231, float addrspace(1)* %tmp480, align 4 + %tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33 + store float %tmp238, float addrspace(1)* %tmp481, align 4 + %tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34 + store float %tmp245, float addrspace(1)* %tmp482, align 4 + %tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35 + store float %tmp252, float addrspace(1)* %tmp483, align 4 + %tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36 + store float %tmp259, float addrspace(1)* %tmp484, align 4 + %tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37 + store float %tmp266, float addrspace(1)* %tmp485, align 4 + %tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38 + store float %tmp273, float addrspace(1)* %tmp486, align 4 + %tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39 + store float %tmp280, float addrspace(1)* %tmp487, align 4 + %tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40 + store float %tmp287, float addrspace(1)* %tmp488, align 4 + %tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41 + store float %tmp294, float addrspace(1)* %tmp489, align 4 + %tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42 + store float %tmp301, float addrspace(1)* %tmp490, align 4 + %tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43 + store float %tmp308, float addrspace(1)* %tmp491, align 4 + %tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44 + store float %tmp315, float addrspace(1)* %tmp492, align 4 + %tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45 + store float %tmp322, float addrspace(1)* %tmp493, align 4 + %tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46 + 
store float %tmp329, float addrspace(1)* %tmp494, align 4 + %tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47 + store float %tmp336, float addrspace(1)* %tmp495, align 4 + %tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48 + store float %tmp343, float addrspace(1)* %tmp496, align 4 + %tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49 + store float %tmp350, float addrspace(1)* %tmp497, align 4 + %tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50 + store float %tmp357, float addrspace(1)* %tmp498, align 4 + %tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51 + store float %tmp364, float addrspace(1)* %tmp499, align 4 + %tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52 + store float %tmp371, float addrspace(1)* %tmp500, align 4 + %tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53 + store float %tmp378, float addrspace(1)* %tmp501, align 4 + %tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54 + store float %tmp385, float addrspace(1)* %tmp502, align 4 + %tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55 + store float %tmp392, float addrspace(1)* %tmp503, align 4 + %tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56 + store float %tmp399, float addrspace(1)* %tmp504, align 4 + %tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57 + store float %tmp406, float addrspace(1)* %tmp505, align 4 + %tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58 + store float %tmp413, float addrspace(1)* %tmp506, align 4 + %tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59 + store float %tmp420, float addrspace(1)* %tmp507, align 4 + %tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60 + store float %tmp427, float addrspace(1)* %tmp508, align 4 + %tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61 + store float %tmp434, float addrspace(1)* %tmp509, align 4 + %tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62 + store float %tmp441, float addrspace(1)* %tmp510, align 4 + %tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63 + store float %tmp448, float addrspace(1)* %tmp511, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.fmuladd.f32(float, float, float) #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll new file mode 100644 index 000000000000..0d19c1e6a8f3 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -0,0 +1,288 @@ +; RUN: llc -march=amdgcn -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s + +; SI: NumSgprs: {{[1-9]$}} +; SI: NumVgprs: {{[1-9]$}} + +; stores may alias loads +; VI: NumSgprs: {{[1-5][0-9]$}} +; VI: NumVgprs: {{[1-3][0-9]$}} + +define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) { +bb: 
+ %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004 + %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252 + %adr.c.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20508 + %adr.a.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20772 + %adr.b.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21020 + %adr.c.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21276 + %adr.a.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21540 + %adr.b.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21788 + %adr.c.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22044 + %adr.a.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22308 + %adr.b.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22556 + %adr.c.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22812 + %adr.a.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23076 + %adr.b.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23324 + %adr.c.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23580 + %adr.a.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23844 + %adr.b.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24092 + %adr.c.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24348 + %adr.a.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24612 + %adr.b.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24860 + %adr.c.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25116 + %adr.a.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25380 + %adr.b.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25628 + %adr.c.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25884 + %adr.a.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26148 + %adr.b.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26396 + %adr.c.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26652 + %adr.a.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26916 + %adr.b.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27164 + %adr.c.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27420 + %adr.a.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27684 + %adr.b.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27932 + %adr.c.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28188 + %adr.a.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28452 + %adr.b.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28700 + %adr.c.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28956 + %adr.a.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29220 + %adr.b.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29468 + %adr.c.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29724 + %adr.a.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29988 + %adr.b.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30236 + %adr.c.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30492 + %adr.a.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30756 + %adr.b.14 = 
getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31004 + %adr.c.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31260 + %adr.a.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31524 + %adr.b.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31772 + %adr.c.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32028 + %adr.a.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32292 + %adr.b.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32540 + %adr.c.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32796 + %adr.a.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33060 + %adr.b.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33308 + %adr.c.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33564 + %adr.a.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33828 + %adr.b.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34076 + %adr.c.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34332 + %adr.a.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34596 + %adr.b.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34844 + %adr.c.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35100 + %adr.a.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35364 + %adr.b.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35612 + %adr.c.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35868 + %adr.a.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36132 + %adr.b.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36380 + %adr.c.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36636 + %adr.a.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36900 + %adr.b.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37148 + %adr.c.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37404 + %adr.a.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37668 + %adr.b.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37916 + %adr.c.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38172 + %adr.a.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38436 + %adr.b.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38684 + %adr.c.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38940 + %adr.a.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39204 + %adr.b.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39452 + %adr.c.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39708 + %adr.a.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39972 + %adr.b.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40220 + %adr.c.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40476 + %adr.a.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40740 + %adr.b.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40988 + %adr.c.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41244 + %adr.a.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41508 + %adr.b.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41756 
+ %adr.c.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42012 + %adr.a.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42276 + %adr.b.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42524 + %adr.c.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42780 + %a.0 = load float, float addrspace(3)* %adr.a.0, align 4 + %b.0 = load float, float addrspace(3)* %adr.b.0, align 4 + %c.0 = load float, float addrspace(3)* %adr.c.0, align 4 + %a.1 = load float, float addrspace(3)* %adr.a.1, align 4 + %b.1 = load float, float addrspace(3)* %adr.b.1, align 4 + %c.1 = load float, float addrspace(3)* %adr.c.1, align 4 + %a.2 = load float, float addrspace(3)* %adr.a.2, align 4 + %b.2 = load float, float addrspace(3)* %adr.b.2, align 4 + %c.2 = load float, float addrspace(3)* %adr.c.2, align 4 + %a.3 = load float, float addrspace(3)* %adr.a.3, align 4 + %b.3 = load float, float addrspace(3)* %adr.b.3, align 4 + %c.3 = load float, float addrspace(3)* %adr.c.3, align 4 + %a.4 = load float, float addrspace(3)* %adr.a.4, align 4 + %b.4 = load float, float addrspace(3)* %adr.b.4, align 4 + %c.4 = load float, float addrspace(3)* %adr.c.4, align 4 + %a.5 = load float, float addrspace(3)* %adr.a.5, align 4 + %b.5 = load float, float addrspace(3)* %adr.b.5, align 4 + %c.5 = load float, float addrspace(3)* %adr.c.5, align 4 + %a.6 = load float, float addrspace(3)* %adr.a.6, align 4 + %b.6 = load float, float addrspace(3)* %adr.b.6, align 4 + %c.6 = load float, float addrspace(3)* %adr.c.6, align 4 + %a.7 = load float, float addrspace(3)* %adr.a.7, align 4 + %b.7 = load float, float addrspace(3)* %adr.b.7, align 4 + %c.7 = load float, float addrspace(3)* %adr.c.7, align 4 + %a.8 = load float, float addrspace(3)* %adr.a.8, align 4 + %b.8 = load float, float addrspace(3)* %adr.b.8, align 4 + %c.8 = load float, float addrspace(3)* %adr.c.8, align 4 + %a.9 = load float, float addrspace(3)* %adr.a.9, align 4 + %b.9 = load float, float addrspace(3)* %adr.b.9, align 4 + %c.9 = load float, float addrspace(3)* %adr.c.9, align 4 + %a.10 = load float, float addrspace(3)* %adr.a.10, align 4 + %b.10 = load float, float addrspace(3)* %adr.b.10, align 4 + %c.10 = load float, float addrspace(3)* %adr.c.10, align 4 + %a.11 = load float, float addrspace(3)* %adr.a.11, align 4 + %b.11 = load float, float addrspace(3)* %adr.b.11, align 4 + %c.11 = load float, float addrspace(3)* %adr.c.11, align 4 + %a.12 = load float, float addrspace(3)* %adr.a.12, align 4 + %b.12 = load float, float addrspace(3)* %adr.b.12, align 4 + %c.12 = load float, float addrspace(3)* %adr.c.12, align 4 + %a.13 = load float, float addrspace(3)* %adr.a.13, align 4 + %b.13 = load float, float addrspace(3)* %adr.b.13, align 4 + %c.13 = load float, float addrspace(3)* %adr.c.13, align 4 + %a.14 = load float, float addrspace(3)* %adr.a.14, align 4 + %b.14 = load float, float addrspace(3)* %adr.b.14, align 4 + %c.14 = load float, float addrspace(3)* %adr.c.14, align 4 + %a.15 = load float, float addrspace(3)* %adr.a.15, align 4 + %b.15 = load float, float addrspace(3)* %adr.b.15, align 4 + %c.15 = load float, float addrspace(3)* %adr.c.15, align 4 + %a.16 = load float, float addrspace(3)* %adr.a.16, align 4 + %b.16 = load float, float addrspace(3)* %adr.b.16, align 4 + %c.16 = load float, float addrspace(3)* %adr.c.16, align 4 + %a.17 = load float, float addrspace(3)* %adr.a.17, align 4 + %b.17 = load float, float addrspace(3)* %adr.b.17, align 4 + %c.17 = load float, float addrspace(3)* 
%adr.c.17, align 4 + %a.18 = load float, float addrspace(3)* %adr.a.18, align 4 + %b.18 = load float, float addrspace(3)* %adr.b.18, align 4 + %c.18 = load float, float addrspace(3)* %adr.c.18, align 4 + %a.19 = load float, float addrspace(3)* %adr.a.19, align 4 + %b.19 = load float, float addrspace(3)* %adr.b.19, align 4 + %c.19 = load float, float addrspace(3)* %adr.c.19, align 4 + %a.20 = load float, float addrspace(3)* %adr.a.20, align 4 + %b.20 = load float, float addrspace(3)* %adr.b.20, align 4 + %c.20 = load float, float addrspace(3)* %adr.c.20, align 4 + %a.21 = load float, float addrspace(3)* %adr.a.21, align 4 + %b.21 = load float, float addrspace(3)* %adr.b.21, align 4 + %c.21 = load float, float addrspace(3)* %adr.c.21, align 4 + %a.22 = load float, float addrspace(3)* %adr.a.22, align 4 + %b.22 = load float, float addrspace(3)* %adr.b.22, align 4 + %c.22 = load float, float addrspace(3)* %adr.c.22, align 4 + %a.23 = load float, float addrspace(3)* %adr.a.23, align 4 + %b.23 = load float, float addrspace(3)* %adr.b.23, align 4 + %c.23 = load float, float addrspace(3)* %adr.c.23, align 4 + %a.24 = load float, float addrspace(3)* %adr.a.24, align 4 + %b.24 = load float, float addrspace(3)* %adr.b.24, align 4 + %c.24 = load float, float addrspace(3)* %adr.c.24, align 4 + %a.25 = load float, float addrspace(3)* %adr.a.25, align 4 + %b.25 = load float, float addrspace(3)* %adr.b.25, align 4 + %c.25 = load float, float addrspace(3)* %adr.c.25, align 4 + %a.26 = load float, float addrspace(3)* %adr.a.26, align 4 + %b.26 = load float, float addrspace(3)* %adr.b.26, align 4 + %c.26 = load float, float addrspace(3)* %adr.c.26, align 4 + %a.27 = load float, float addrspace(3)* %adr.a.27, align 4 + %b.27 = load float, float addrspace(3)* %adr.b.27, align 4 + %c.27 = load float, float addrspace(3)* %adr.c.27, align 4 + %a.28 = load float, float addrspace(3)* %adr.a.28, align 4 + %b.28 = load float, float addrspace(3)* %adr.b.28, align 4 + %c.28 = load float, float addrspace(3)* %adr.c.28, align 4 + %a.29 = load float, float addrspace(3)* %adr.a.29, align 4 + %b.29 = load float, float addrspace(3)* %adr.b.29, align 4 + %c.29 = load float, float addrspace(3)* %adr.c.29, align 4 + %res.0 = tail call float @llvm.fmuladd.f32(float %a.0, float %b.0, float %c.0) + %res.1 = tail call float @llvm.fmuladd.f32(float %a.1, float %b.1, float %c.1) + %res.2 = tail call float @llvm.fmuladd.f32(float %a.2, float %b.2, float %c.2) + %res.3 = tail call float @llvm.fmuladd.f32(float %a.3, float %b.3, float %c.3) + %res.4 = tail call float @llvm.fmuladd.f32(float %a.4, float %b.4, float %c.4) + %res.5 = tail call float @llvm.fmuladd.f32(float %a.5, float %b.5, float %c.5) + %res.6 = tail call float @llvm.fmuladd.f32(float %a.6, float %b.6, float %c.6) + %res.7 = tail call float @llvm.fmuladd.f32(float %a.7, float %b.7, float %c.7) + %res.8 = tail call float @llvm.fmuladd.f32(float %a.8, float %b.8, float %c.8) + %res.9 = tail call float @llvm.fmuladd.f32(float %a.9, float %b.9, float %c.9) + %res.10 = tail call float @llvm.fmuladd.f32(float %a.10, float %b.10, float %c.10) + %res.11 = tail call float @llvm.fmuladd.f32(float %a.11, float %b.11, float %c.11) + %res.12 = tail call float @llvm.fmuladd.f32(float %a.12, float %b.12, float %c.12) + %res.13 = tail call float @llvm.fmuladd.f32(float %a.13, float %b.13, float %c.13) + %res.14 = tail call float @llvm.fmuladd.f32(float %a.14, float %b.14, float %c.14) + %res.15 = tail call float @llvm.fmuladd.f32(float %a.15, float %b.15, float %c.15) + %res.16 = tail 
call float @llvm.fmuladd.f32(float %a.16, float %b.16, float %c.16) + %res.17 = tail call float @llvm.fmuladd.f32(float %a.17, float %b.17, float %c.17) + %res.18 = tail call float @llvm.fmuladd.f32(float %a.18, float %b.18, float %c.18) + %res.19 = tail call float @llvm.fmuladd.f32(float %a.19, float %b.19, float %c.19) + %res.20 = tail call float @llvm.fmuladd.f32(float %a.20, float %b.20, float %c.20) + %res.21 = tail call float @llvm.fmuladd.f32(float %a.21, float %b.21, float %c.21) + %res.22 = tail call float @llvm.fmuladd.f32(float %a.22, float %b.22, float %c.22) + %res.23 = tail call float @llvm.fmuladd.f32(float %a.23, float %b.23, float %c.23) + %res.24 = tail call float @llvm.fmuladd.f32(float %a.24, float %b.24, float %c.24) + %res.25 = tail call float @llvm.fmuladd.f32(float %a.25, float %b.25, float %c.25) + %res.26 = tail call float @llvm.fmuladd.f32(float %a.26, float %b.26, float %c.26) + %res.27 = tail call float @llvm.fmuladd.f32(float %a.27, float %b.27, float %c.27) + %res.28 = tail call float @llvm.fmuladd.f32(float %a.28, float %b.28, float %c.28) + %res.29 = tail call float @llvm.fmuladd.f32(float %a.29, float %b.29, float %c.29) + %adr.res.0 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 0 + %adr.res.1 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 2 + %adr.res.2 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 4 + %adr.res.3 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 6 + %adr.res.4 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 8 + %adr.res.5 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 10 + %adr.res.6 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 12 + %adr.res.7 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 14 + %adr.res.8 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 16 + %adr.res.9 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 18 + %adr.res.10 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 20 + %adr.res.11 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 22 + %adr.res.12 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 24 + %adr.res.13 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 26 + %adr.res.14 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 28 + %adr.res.15 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 30 + %adr.res.16 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 32 + %adr.res.17 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 34 + %adr.res.18 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 36 + %adr.res.19 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 38 + %adr.res.20 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 40 + %adr.res.21 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 42 + %adr.res.22 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 44 + %adr.res.23 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 46 + %adr.res.24 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 48 + %adr.res.25 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 50 + %adr.res.26 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 52 + %adr.res.27 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 54 + %adr.res.28 = getelementptr inbounds float, float addrspace(1)* 
%out_arg, i64 56 + %adr.res.29 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 58 + store float %res.0, float addrspace(1)* %adr.res.0, align 4 + store float %res.1, float addrspace(1)* %adr.res.1, align 4 + store float %res.2, float addrspace(1)* %adr.res.2, align 4 + store float %res.3, float addrspace(1)* %adr.res.3, align 4 + store float %res.4, float addrspace(1)* %adr.res.4, align 4 + store float %res.5, float addrspace(1)* %adr.res.5, align 4 + store float %res.6, float addrspace(1)* %adr.res.6, align 4 + store float %res.7, float addrspace(1)* %adr.res.7, align 4 + store float %res.8, float addrspace(1)* %adr.res.8, align 4 + store float %res.9, float addrspace(1)* %adr.res.9, align 4 + store float %res.10, float addrspace(1)* %adr.res.10, align 4 + store float %res.11, float addrspace(1)* %adr.res.11, align 4 + store float %res.12, float addrspace(1)* %adr.res.12, align 4 + store float %res.13, float addrspace(1)* %adr.res.13, align 4 + store float %res.14, float addrspace(1)* %adr.res.14, align 4 + store float %res.15, float addrspace(1)* %adr.res.15, align 4 + store float %res.16, float addrspace(1)* %adr.res.16, align 4 + store float %res.17, float addrspace(1)* %adr.res.17, align 4 + store float %res.18, float addrspace(1)* %adr.res.18, align 4 + store float %res.19, float addrspace(1)* %adr.res.19, align 4 + store float %res.20, float addrspace(1)* %adr.res.20, align 4 + store float %res.21, float addrspace(1)* %adr.res.21, align 4 + store float %res.22, float addrspace(1)* %adr.res.22, align 4 + store float %res.23, float addrspace(1)* %adr.res.23, align 4 + store float %res.24, float addrspace(1)* %adr.res.24, align 4 + store float %res.25, float addrspace(1)* %adr.res.25, align 4 + store float %res.26, float addrspace(1)* %adr.res.26, align 4 + store float %res.27, float addrspace(1)* %adr.res.27, align 4 + store float %res.28, float addrspace(1)* %adr.res.28, align 4 + store float %res.29, float addrspace(1)* %adr.res.29, align 4 + ret void +} +declare float @llvm.fmuladd.f32(float, float, float) #0 +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/schedule-regpressure.mir b/test/CodeGen/AMDGPU/schedule-regpressure.mir new file mode 100644 index 000000000000..c71de87eeece --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-regpressure.mir @@ -0,0 +1,57 @@ +# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=misched 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Check there is no SReg_32 pressure created by DS_* instructions because of M0 use + +# CHECK: ScheduleDAGMILive::schedule starting +# CHECK: SU({{.*}} = DS_READ_B32 {{.*}} %M0, %EXEC +# CHECK: Pressure Diff : {{$}} +# CHECK: SU({{.*}} DS_WRITE_B32 + +--- +name: mo_pset +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_128 } + - { id: 1, class: sgpr_64 } + - { id: 2, class: sreg_32_xm0 } + - { id: 3, class: sgpr_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_32_xm0_xexec } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + - { id: 8, class: vgpr_32 } +liveins: + - { reg: '%sgpr4_sgpr5', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + 
hasMustTailInVarArgFunc: false +body: | + bb.0: + liveins: %sgpr4_sgpr5 + + %1 = COPY %sgpr4_sgpr5 + %5 = S_LOAD_DWORD_IMM %1, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`) + %m0 = S_MOV_B32 -1 + %7 = COPY %5 + %6 = DS_READ_B32 %7, 0, 0, implicit %m0, implicit %exec + DS_WRITE_B32 %7, %6, 4, 0, implicit killed %m0, implicit %exec + S_ENDPGM + +... diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll index 94101f0b92b6..6b1e85915a11 100644 --- a/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -9,11 +9,11 @@ ; should be able to reuse the same regiser for each scratch buffer access. ; GCN-LABEL: {{^}}legal_offset_fi: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+$}} -; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8000 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} -define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { +define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { entry: %scratch0 = alloca [8192 x i32] %scratch1 = alloca [8192 x i32] @@ -49,11 +49,11 @@ done: ; GCN-LABEL: {{^}}legal_offset_fi_offset: ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} ; This constant isn't folded, because it has multiple uses. -; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8000 +; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004 ; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} -define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { +define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { entry: %scratch0 = alloca [8192 x i32] %scratch1 = alloca [8192 x i32] @@ -88,7 +88,7 @@ done: ; GCN-LABEL: {{^}}neg_vaddr_offset: ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}} -define void @neg_vaddr_offset(i32 %offset) { +define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) { entry: %array = alloca [8192 x i32] %ptr_offset = add i32 %offset, 4 @@ -98,8 +98,8 @@ entry: } ; GCN-LABEL: {{^}}pos_vaddr_offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16 -define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20 +define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: %array = alloca [8192 x i32] %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4 diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll index bafd6a50ccfe..f9ac425be794 100644 --- a/test/CodeGen/AMDGPU/sdiv.ll +++ b/test/CodeGen/AMDGPU/sdiv.ll @@ -13,7 +13,7 @@ ; FUNC-LABEL: {{^}}sdiv_i32: ; EG: CF_END -define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 
1 %num = load i32, i32 addrspace(1) * %in %den = load i32, i32 addrspace(1) * %den_ptr @@ -23,7 +23,7 @@ define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { } ; FUNC-LABEL: {{^}}sdiv_i32_4: -define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = sdiv i32 %num, 4 store i32 %result, i32 addrspace(1)* %out @@ -43,14 +43,14 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_add_i32 ; SI: buffer_store_dword ; SI: s_endpgm -define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = sdiv i32 %num, 3435 store i32 %result, i32 addrspace(1)* %out ret void } -define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr @@ -59,14 +59,14 @@ define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ret void } -define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %result = sdiv <2 x i32> %num, store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void } -define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr @@ -75,7 +75,7 @@ define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ret void } -define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = sdiv <4 x i32> %num, store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -86,7 +86,7 @@ define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -100,7 +100,7 @@ define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1 %num = load i23, i23 addrspace(1) * %in %den = load i23, i23 addrspace(1) * %den_ptr @@ -114,7 
+114,7 @@ define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1 %num = load i24, i24 addrspace(1) * %in %den = load i24, i24 addrspace(1) * %den_ptr @@ -126,7 +126,7 @@ define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}v_sdiv_i25: ; SI-NOT: v_rcp_f32 -define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1 %num = load i25, i25 addrspace(1) * %in %den = load i25, i25 addrspace(1) * %den_ptr @@ -137,19 +137,19 @@ define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { } ; Tests for 64-bit divide bypass. -; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %result = sdiv i64 %a, %b ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void ; } -; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %result = srem i64 %a, %b ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void ; } -; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %resultdiv = sdiv i64 %a, %b ; %resultrem = srem i64 %a, %b ; %result = add i64 %resultdiv, %resultrem @@ -163,7 +163,7 @@ define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { ; SI: v_mul_hi_i32 ; SI: v_mul_hi_i32 -define void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { +define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %2 = sdiv <4 x i32> %1, store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll index 349a7821da17..257e6be96b65 100644 --- a/test/CodeGen/AMDGPU/sdivrem24.ll +++ b/test/CodeGen/AMDGPU/sdivrem24.ll @@ -12,7 +12,7 @@ ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -31,7 +31,7 @@ define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -50,7 +50,7 @@ define 
void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -69,7 +69,7 @@ define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -88,7 +88,7 @@ define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -107,7 +107,7 @@ define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -130,7 +130,7 @@ define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -149,7 +149,7 @@ define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -168,7 +168,7 @@ define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -187,7 +187,7 @@ define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i32(i32 addrspace(1)* %out, i32 
addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -206,7 +206,7 @@ define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -225,7 +225,7 @@ define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -244,7 +244,7 @@ define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -263,7 +263,7 @@ define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -283,7 +283,7 @@ define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -303,7 +303,7 @@ define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -323,7 +323,7 @@ define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, 
align 4 diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll index 28fdb69e1ada..5ad0d8efaed3 100644 --- a/test/CodeGen/AMDGPU/sdivrem64.ll +++ b/test/CodeGen/AMDGPU/sdivrem64.ll @@ -70,7 +70,7 @@ ; SI-NOT: v_lshr_b64 ; VI-NOT: v_lshrrev_b64 ; GCN: s_endpgm -define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = sdiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -144,7 +144,7 @@ define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -159,7 +159,7 @@ define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = sdiv i64 %1, %2 @@ -176,7 +176,7 @@ define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = srem i64 %1, %2 @@ -196,7 +196,7 @@ define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = sdiv i64 %1, %2 @@ -216,7 +216,7 @@ define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = srem i64 %1, %2 diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll new file mode 100644 index 000000000000..1e0ac3807528 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -0,0 +1,395 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s + +; GCN-LABEL: {{^}}add_shr_i32: +; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]] +; NOSDWA-NOT: v_add_i32_sdwa + +; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %a = load i32, i32 addrspace(1)* %in, align 4 + %shr = lshr i32 %a, 16 + %add = add i32 %a, %shr + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}sub_shr_i32: +; NOSDWA: 
v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_subrev_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]] +; NOSDWA-NOT: v_subrev_i32_sdwa + +; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %a = load i32, i32 addrspace(1)* %in, align 4 + %shr = lshr i32 %a, 16 + %sub = sub i32 %shr, %a + store i32 %sub, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_shr_i32: +; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]] +; NOSDWA-NOT: v_mul_u32_u24_sdwa + +; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 + +define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) { + %a = load i32, i32 addrspace(1)* %in1, align 4 + %b = load i32, i32 addrspace(1)* %in2, align 4 + %shra = lshr i32 %a, 16 + %shrb = lshr i32 %b, 16 + %mul = mul i32 %shra, %shrb + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_i16: +; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa +; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-NOT: v_mul_u32_u24_sdwa + +define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) { +entry: + %a = load i16, i16 addrspace(1)* %ina, align 4 + %b = load i16, i16 addrspace(1)* %inb, align 4 + %mul = mul i16 %a, %b + store i16 %mul, i16 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v2i16: +; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]] +; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa + +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +entry: + %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 + %mul = mul <2 x i16> %a, %b + store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v4i16: +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa + +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: 
v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { +entry: + %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4 + %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4 + %mul = mul <4 x i16> %a, %b + store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v8i16: +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa + +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { +entry: + %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4 + %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4 + %mul = mul <8 x i16> %a, %b + store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: 
{{^}}mul_half: +; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_f16_sdwa +; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-NOT: v_mul_f16_sdwa + +define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) { +entry: + %a = load half, half addrspace(1)* %ina, align 4 + %b = load half, half addrspace(1)* %inb, align 4 + %mul = fmul half %a, %b + store half %mul, half addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v2half: +; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]] +; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} +; NOSDWA-NOT: v_mul_f16_sdwa + +; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] +define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { +entry: + %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 + %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4 + %mul = fmul <2 x half> %a, %b + store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v4half: +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_f16_sdwa + +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { +entry: + %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4 + %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4 + %mul = fmul <4 x half> %a, %b + store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v8half: +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_f16_sdwa + +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { +entry: + %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4 + %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4 + %mul = fmul <8 x half> %a, %b + store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_i8: +; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa +; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-NOT: v_mul_u32_u24_sdwa + +define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) { +entry: + %a = load i8, i8 addrspace(1)* %ina, align 4 + %b = load i8, i8 addrspace(1)* %inb, align 4 + %mul = mul i8 %a, %b + store i8 %mul, i8 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v2i8: +; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa + +; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 + +define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { +entry: + %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4 + %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4 + %mul = mul <2 x i8> %a, %b + store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v4i8: +; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa + +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa + +define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { +entry: + %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4 + %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4 + %mul = mul <4 x i8> %a, %b + store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}mul_v8i8: +; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa + +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa + +define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 
x i8> addrspace(1)* %inb) { +entry: + %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4 + %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4 + %mul = mul <8 x i8> %a, %b + store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16: +; NOSDWA-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; NOSDWA-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_cvt_f32_i32_sdwa + +; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 + +define amdgpu_kernel void @sitofp_v2i16_to_v2f16( + <2 x half> addrspace(1)* %r, + <2 x i16> addrspace(1)* %a) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %r.val = sitofp <2 x i16> %a.val to <2 x half> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + + +; GCN-LABEL: {{^}}mac_v2half: +; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} +; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]] +; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} +; NOSDWA-NOT: v_mac_f16_sdwa + +; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] + +define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { +entry: + %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 + %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4 + %mul = fmul <2 x half> %a, %b + %mac = fadd <2 x half> %mul, %b + store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}immediate_mul_v2i16: +; NOSDWA-NOT: v_mul_u32_u24_sdwa +; SDWA-NOT: v_mul_u32_u24_sdwa + +define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +entry: + %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 + %mul = mul <2 x i16> %a, + store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; Double use of same src - should not convert it +; GCN-LABEL: {{^}}mulmul_v2i16: +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_mul_u32_u24_sdwa + +; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +entry: + %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 + %mul = mul <2 x i16> %a, %b + %mul2 = mul <2 x i16> %mul, %b + store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}add_bb_v2i16: +; NOSDWA-NOT: v_add_i32_sdwa + +; SDWA: v_add_i32_sdwa 
v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 + +define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +entry: + %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 + br label %add_label +add_label: + %add = add <2 x i16> %a, %b + br label %store_label +store_label: + store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll index 559d464f36a5..c8839c17365e 100644 --- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll +++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { +define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -29,7 +29,7 @@ define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { +define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0) diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll index 73dadde884ae..3417eb02b361 100644 --- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}add_select_fabs_fabs_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] @@ -8,7 +8,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] -define void @add_select_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -30,7 +30,7 @@ define void @add_select_fabs_fabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]] -define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { +define 
amdgpu_kernel void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -57,7 +57,7 @@ define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { ; GCN: buffer_store_dword [[ADD]] ; GCN: buffer_store_dword [[X_ABS]] -define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -80,7 +80,7 @@ define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]] -define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -104,7 +104,7 @@ define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { ; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_var_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -123,7 +123,7 @@ define void @add_select_fabs_var_f32(i32 %c) #0 { ; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_fabs_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -140,7 +140,7 @@ define void @add_select_fabs_negk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]] -define void @add_select_fabs_negk_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -155,7 +155,7 @@ define void @add_select_fabs_negk_negk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_posk_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float 2.0, float 1.0 @@ -172,7 +172,7 @@ define void @add_select_posk_posk_f32(i32 %c) #0 { ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_negk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 { %x = load volatile float, float 
addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -192,7 +192,7 @@ define void @add_select_negk_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_negliteralk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -209,7 +209,7 @@ define void @add_select_negliteralk_fabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] -define void @add_select_fabs_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -228,7 +228,7 @@ define void @add_select_fabs_posk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] -define void @add_select_posk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -246,7 +246,7 @@ define void @add_select_posk_fabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -268,7 +268,7 @@ define void @add_select_fneg_fneg_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]] -define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -295,7 +295,7 @@ define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { ; GCN: buffer_store_dword [[ADD]] ; GCN: buffer_store_dword [[NEG_X]] -define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -318,7 +318,7 @@ define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]] -define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -342,7 
+342,7 @@ define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { ; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fneg_var_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -360,7 +360,7 @@ define void @add_select_fneg_var_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -372,13 +372,13 @@ define void @add_select_fneg_negk_f32(i32 %c) #0 { } ; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_inv2pi_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -390,15 +390,15 @@ define void @add_select_fneg_inv2pi_f32(i32 %c) #0 { } ; GCN-LABEL: {{^}}add_select_fneg_neginv2pi_f32: -; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] +; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 ; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -415,7 +415,7 @@ define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_negk_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -432,7 +432,7 @@ define void @add_select_negk_negk_f32(i32 %c) #0 { ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2048.0, float -4096.0 @@ -446,7 +446,7 @@ define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] -define void @add_select_fneg_negk_negk_f32(i32 
%c) #0 { +define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -463,7 +463,7 @@ define void @add_select_fneg_negk_negk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_negk_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -480,7 +480,7 @@ define void @add_select_negk_fneg_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -498,7 +498,7 @@ define void @add_select_fneg_posk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_posk_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -518,7 +518,7 @@ define void @add_select_posk_fneg_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_negfabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -541,7 +541,7 @@ define void @add_select_negfabs_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -564,7 +564,7 @@ define void @add_select_fabs_negfabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_neg_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -586,7 +586,7 @@ define void @add_select_neg_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_neg_f32(i32 %c) #0 { +define amdgpu_kernel void 
@add_select_fabs_neg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -607,7 +607,7 @@ define void @add_select_fabs_neg_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_neg_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -629,7 +629,7 @@ define void @add_select_neg_negfabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_negfabs_neg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -651,7 +651,7 @@ define void @add_select_negfabs_neg_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] -define void @mul_select_negfabs_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -672,7 +672,7 @@ define void @mul_select_negfabs_posk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] -define void @mul_select_posk_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -690,7 +690,7 @@ define void @mul_select_posk_negfabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] -define void @mul_select_negfabs_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -709,7 +709,7 @@ define void @mul_select_negfabs_negk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] -define void @mul_select_negk_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -732,7 +732,7 @@ define void @mul_select_negk_negfabs_f32(i32 %c) #0 { ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_add_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile 
float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -749,7 +749,7 @@ define void @select_fneg_posk_src_add_f32(i32 %c) #0 { ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_sub_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %add = fsub float %x, 4.0 @@ -765,7 +765,7 @@ define void @select_fneg_posk_src_sub_f32(i32 %c) #0 { ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_mul_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %mul = fmul float %x, 4.0 @@ -782,7 +782,7 @@ define void @select_fneg_posk_src_mul_f32(i32 %c) #0 { ; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_fma_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -799,7 +799,7 @@ define void @select_fneg_posk_src_fma_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -818,7 +818,7 @@ define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll index 07dcb2153384..5eaad1f363f9 100644 --- a/test/CodeGen/AMDGPU/select-i1.ll +++ b/test/CodeGen/AMDGPU/select-i1.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}select_i1: ; SI: v_cndmask_b32 ; SI-NOT: v_cndmask_b32 -define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { +define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i1 %a, i1 %b store i1 %sel, i1 addrspace(1)* %out, align 4 @@ -19,7 +19,7 @@ define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind ; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 ; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]] ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -define void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { +define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { %cmp = icmp slt i1 %cond, false %sel = select i1 %cmp, i1 %a, i1 
%b store i1 %sel, i1 addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/select-opt.ll b/test/CodeGen/AMDGPU/select-opt.ll index ad358d33c405..d56b952118b5 100644 --- a/test/CodeGen/AMDGPU/select-opt.ll +++ b/test/CodeGen/AMDGPU/select-opt.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %and = and i1 %icmp0, %icmp1 @@ -27,7 +27,7 @@ define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %and = and i1 %fcmp0, %fcmp1 @@ -43,7 +43,7 @@ define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %and = and i1 %icmp0, %icmp1 @@ -59,7 +59,7 @@ define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %and = and i1 %fcmp0, %fcmp1 @@ -76,7 +76,7 @@ define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] ; GCN: s_endpgm -define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %or = or i1 %icmp0, %icmp1 @@ -92,7 +92,7 @@ define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { %fcmp0 = 
fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %or = or i1 %fcmp0, %fcmp1 @@ -108,7 +108,7 @@ define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float % ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %or = or i1 %icmp0, %icmp1 @@ -124,7 +124,7 @@ define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %or = or i1 %fcmp0, %fcmp1 @@ -138,7 +138,7 @@ define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float % ; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} -define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { +define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { entry: %cmp0 = fcmp oeq float %c0, 1.0 br i1 %cmp0, label %if0, label %endif diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll index 759abe2f2e9a..8710fc8c7307 100644 --- a/test/CodeGen/AMDGPU/select-vectors.ll +++ b/test/CodeGen/AMDGPU/select-vectors.ll @@ -10,7 +10,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { +define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { %cmp = icmp eq i8 %c, 0 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 @@ -22,7 +22,7 @@ define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 @@ -36,7 +36,7 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx2 -define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 @@ -49,7 +49,7 @@ 
define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx4 -define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 @@ -64,7 +64,7 @@ define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: buffer_store_dwordx4 -define void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 { +define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -82,7 +82,7 @@ bb: ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 @@ -102,7 +102,7 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> ; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx2 -define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x float> %a, <2 x float> %b store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 @@ -120,7 +120,7 @@ define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx4 -define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x float> %a, <4 x float> %b store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 @@ -135,7 +135,7 @@ define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: buffer_store_dwordx4 -define void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 { +define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 %val = load <4 x float>, <4 x float> addrspace(1)* %in @@ -153,7 +153,7 @@ bb: ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 
%select = select i1 %cmp, <8 x float> %a, <8 x float> %b store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 @@ -165,7 +165,7 @@ define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x f ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 @@ -181,7 +181,7 @@ define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 @@ -205,7 +205,7 @@ define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll index 19fe8d9b2326..2a7a9c9e0638 100644 --- a/test/CodeGen/AMDGPU/select.f16.ll +++ b/test/CodeGen/AMDGPU/select.f16.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; GCN-LABEL: {{^}}select_f16 +; GCN-LABEL: {{^}}select_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] @@ -17,7 +17,7 @@ ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16( +define amdgpu_kernel void @select_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -34,13 +34,12 @@ entry: ret void } -; GCN-LABEL: {{^}}select_f16_imm_a +; GCN-LABEL: {{^}}select_f16_imm_a: ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3800{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_gt_f32_e32 vcc, v[[B_F32]], v[[A_F32]] +; SI: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] @@ -49,7 +48,7 @@ entry: ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void 
@select_f16_imm_a( +define amdgpu_kernel void @select_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c, @@ -64,22 +63,22 @@ entry: ret void } -; GCN-LABEL: {{^}}select_f16_imm_b +; GCN-LABEL: {{^}}select_f16_imm_b: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x3800{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; SI: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] + ; VI: v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]] ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_b( +define amdgpu_kernel void @select_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c, @@ -94,23 +93,23 @@ entry: ret void } -; GCN-LABEL: {{^}}select_f16_imm_c +; GCN-LABEL: {{^}}select_f16_imm_c: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x3800{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] ; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[C_F32]], v[[D_F32]], vcc +; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] + ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}} ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_c( +define amdgpu_kernel void @select_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -125,23 +124,22 @@ entry: ret void } -; GCN-LABEL: {{^}}select_f16_imm_d +; GCN-LABEL: {{^}}select_f16_imm_d: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], 0x3800{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] ; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] +; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}} ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_d( +define amdgpu_kernel void @select_f16_imm_d( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -156,21 +154,25 @@ entry: ret void } -; GCN-LABEL: {{^}}select_v2f16 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e64 -; SI: v_cmp_lt_f32_e32 -; VI: v_cmp_lt_f16_e32 -; VI: v_cmp_lt_f16_e64 -; GCN: 
v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e64 -; SI: v_cvt_f16_f32_e32 -; SI: v_cvt_f16_f32_e32 +; GCN-LABEL: {{^}}select_v2f16: +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cmp_lt_f32_e64 +; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e64 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 + +; VI: v_cmp_lt_f16_e64 +; VI: v_cmp_lt_f16_e32 +; VI: v_cndmask_b32_e64 +; VI: v_cndmask_b32_e32 + ; GCN: s_endpgm -define void @select_v2f16( +define amdgpu_kernel void @select_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -187,25 +189,24 @@ entry: ret void } -; GCN-LABEL: {{^}}select_v2f16_imm_a +; GCN-LABEL: {{^}}select_v2f16_imm_a: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_gt_f32_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_gt_f32_e64 +; SI: v_cmp_lt_f32_e64 +; SI: v_cmp_lt_f32_e32 vcc, 0.5 + ; VI: v_cmp_lt_f16_e32 ; VI: v_cmp_lt_f16_e64 ; GCN: v_cndmask_b32_e32 -; SI: v_cvt_f16_f32_e32 ; GCN: v_cndmask_b32_e64 ; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_a( +define amdgpu_kernel void @select_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b, <2 x half> addrspace(1)* %c, @@ -220,25 +221,25 @@ entry: ret void } -; GCN-LABEL: {{^}}select_v2f16_imm_b +; GCN-LABEL: {{^}}select_v2f16_imm_b: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e64 +; SI: v_cmp_gt_f32_e64 +; SI: v_cmp_gt_f32_e32 vcc, 0.5 + ; VI: v_cmp_gt_f16_e32 ; VI: v_cmp_gt_f16_e64 ; GCN: v_cndmask_b32_e32 -; SI: v_cvt_f16_f32_e32 ; GCN: v_cndmask_b32_e64 + +; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_b( +define amdgpu_kernel void @select_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %c, @@ -253,9 +254,7 @@ entry: ret void } -; GCN-LABEL: {{^}}select_v2f16_imm_c -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 +; GCN-LABEL: {{^}}select_v2f16_imm_c: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 @@ -263,10 +262,10 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e32 -; SI: v_cmp_lt_f32_e64 -; SI: v_cndmask_b32_e32 +; SI: v_cmp_nlt_f32_e32 +; SI: v_cmp_nlt_f32_e64 ; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 ; VI: v_cmp_nlt_f16_e32 ; VI: v_cndmask_b32_e32 @@ -277,7 +276,7 @@ entry: ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_c( +define amdgpu_kernel void @select_v2f16_imm_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -292,25 +291,24 @@ entry: ret void } -; GCN-LABEL: {{^}}select_v2f16_imm_d -; SI: v_cvt_f32_f16_e32 +; GCN-LABEL: {{^}}select_v2f16_imm_d: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e32 ; SI: v_cmp_lt_f32_e64 +; SI: v_cmp_lt_f32_e32 + ; VI: v_cmp_lt_f16_e32 ; VI: v_cmp_lt_f16_e64 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e64 +; GCN: v_cndmask_b32 +; GCN: v_cndmask_b32 ; SI: v_cvt_f16_f32_e32 ; SI: 
v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_d( +define amdgpu_kernel void @select_v2f16_imm_d( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, diff --git a/test/CodeGen/AMDGPU/select.ll b/test/CodeGen/AMDGPU/select.ll index 45f3cd5a7ac5..e53c159a2f71 100644 --- a/test/CodeGen/AMDGPU/select.ll +++ b/test/CodeGen/AMDGPU/select.ll @@ -14,7 +14,7 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, +define amdgpu_kernel void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, i32 %cond) { diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll index a68fdecb00af..3b4c925a87a0 100644 --- a/test/CodeGen/AMDGPU/select64.ll +++ b/test/CodeGen/AMDGPU/select64.ll @@ -7,7 +7,7 @@ ; CHECK-NOT: s_lshr_b64 ; CHECK: v_cndmask ; CHECK: v_cndmask -define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { +define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { entry: %0 = icmp ugt i32 %cond, 5 %1 = select i1 %0, i64 0, i64 %in @@ -18,7 +18,7 @@ entry: ; CHECK-LABEL: {{^}}select_trunc_i64: ; CHECK: v_cndmask_b32 ; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { +define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i64 0, i64 %in %trunc = trunc i64 %sel to i32 @@ -29,7 +29,7 @@ define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwi ; CHECK-LABEL: {{^}}select_trunc_i64_2: ; CHECK: v_cndmask_b32 ; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i64 %a, i64 %b %trunc = trunc i64 %sel to i32 @@ -40,7 +40,7 @@ define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 % ; CHECK-LABEL: {{^}}v_select_trunc_i64_2: ; CHECK: v_cndmask_b32 ; CHECK-NOT: v_cndmask_b32 -define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 @@ -54,7 +54,7 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}} ; CHECK: s_endpgm -define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 diff --git 
a/test/CodeGen/AMDGPU/selectcc-cnd.ll b/test/CodeGen/AMDGPU/selectcc-cnd.ll index 94d0ace75697..18616851c9c2 100644 --- a/test/CodeGen/AMDGPU/selectcc-cnd.ll +++ b/test/CodeGen/AMDGPU/selectcc-cnd.ll @@ -3,7 +3,7 @@ ;CHECK-NOT: SETE ;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, ;CHECK: 1073741824 -define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, float addrspace(1)* %in) { %1 = load float, float addrspace(1)* %in %2 = fcmp oeq float %1, 0.0 %3 = select i1 %2, float 1.0, float 2.0 diff --git a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll index 58a4ee7d62b2..1504165d3d2b 100644 --- a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll +++ b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll @@ -3,7 +3,7 @@ ;CHECK-NOT: SETE_INT ;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, ;CHECK-NEXT: 2 -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %1 = load i32, i32 addrspace(1)* %in %2 = icmp eq i32 %1, 0 %3 = select i1 %2, i32 1, i32 2 diff --git a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll index e870ee891e66..7af5478600bb 100644 --- a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll +++ b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll @@ -6,7 +6,7 @@ ; CHECK-NEXT: -1 ; Test a selectcc with i32 LHS/RHS and float True/False -define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = load i32, i32 addrspace(1)* %in %1 = icmp sge i32 %0, 0 diff --git a/test/CodeGen/AMDGPU/selectcc-opt.ll b/test/CodeGen/AMDGPU/selectcc-opt.ll index 0f46d4c7ea06..8fef3f8b3808 100644 --- a/test/CodeGen/AMDGPU/selectcc-opt.ll +++ b/test/CodeGen/AMDGPU/selectcc-opt.ll @@ -7,7 +7,7 @@ ; EG-NOT: CND ; EG: SET{{[NEQGTL]+}}_DX10 -define void @test_a(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.000000e+00 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -35,7 +35,7 @@ ENDIF: ; EG: SET{{[GTEQN]+}}_DX10 ; EG-NEXT: PRED_ ; EG-NEXT: ALU clause starting -define void @test_b(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -59,7 +59,7 @@ ENDIF: ; Test a CND*_INT instruction with float true/false values ; EG-LABEL: {{^}}test_c: ; EG: CND{{[GTE]+}}_INT -define void @test_c(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @test_c(float addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 %1 = select i1 %0, float 2.0, float 3.0 @@ -72,7 +72,7 @@ entry: ; SI-NEXT: v_cndmask_b32_e64 ; SI-NOT: cmp ; SI-NOT: cndmask -define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = select i1 %icmp0, i32 -1, i32 0 store i32 %ext, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/selectcc.ll b/test/CodeGen/AMDGPU/selectcc.ll index 446d4ab344b2..7eca22913987 100644 --- a/test/CodeGen/AMDGPU/selectcc.ll +++ b/test/CodeGen/AMDGPU/selectcc.ll @@ -11,7 +11,7 @@ ; SI: v_cmp_eq_u64 ; SI: v_cndmask ; SI: 
v_cndmask -define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { +define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { entry: %0 = icmp eq i64 %lhs, %rhs %1 = select i1 %0, i64 %true, i64 %false diff --git a/test/CodeGen/AMDGPU/selected-stack-object.ll b/test/CodeGen/AMDGPU/selected-stack-object.ll index 37f2747d9815..50ca59ace94e 100644 --- a/test/CodeGen/AMDGPU/selected-stack-object.ll +++ b/test/CodeGen/AMDGPU/selected-stack-object.ll @@ -1,4 +1,4 @@ -; "Assertion failure" should be caught with both XFAIL:* and +Asserts. +; "Assertion failure" should be caught with both XFAIL * and +Asserts. ; XFAIL: * ; REQUIRES: asserts diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll index 57365a6e1fc3..6867c6394937 100644 --- a/test/CodeGen/AMDGPU/set-dx10.ll +++ b/test/CodeGen/AMDGPU/set-dx10.ll @@ -8,7 +8,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp une float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -22,7 +22,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp une float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -34,7 +34,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oeq float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -48,7 +48,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oeq float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -60,7 +60,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ogt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -74,7 +74,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ogt float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -86,7 +86,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void 
@fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oge float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -100,7 +100,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oge float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -112,7 +112,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -126,7 +126,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -138,7 +138,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -152,7 +152,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 diff --git a/test/CodeGen/AMDGPU/setcc-equivalent.ll b/test/CodeGen/AMDGPU/setcc-equivalent.ll index 11ea793650c4..853afa8772ea 100644 --- a/test/CodeGen/AMDGPU/setcc-equivalent.ll +++ b/test/CodeGen/AMDGPU/setcc-equivalent.ll @@ -3,7 +3,7 @@ ; EG-LABEL: {{^}}and_setcc_setcc_i32: ; EG: AND_INT ; EG-NEXT: SETE_INT -define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %cmp1 = icmp eq i32 %a, -1 %cmp2 = icmp eq i32 %b, -1 %and = and i1 %cmp1, %cmp2 @@ -20,7 +20,7 @@ define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; EG: SETE_INT ; EG: AND_INT ; EG: SETE_INT -define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { +define amdgpu_kernel void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { %cmp1 = icmp eq <4 x i32> %a, %cmp2 = icmp eq <4 x i32> %b, %and = and <4 x i1> %cmp1, %cmp2 diff --git a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll new file mode 100644 index 000000000000..8d455d84bf9e --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll @@ -0,0 +1,258 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s + +; Test fcmp pred (fneg x), c -> fcmp (swapped pred) x, -c 
combine. + +; GCN-LABEL: {{^}}multi_use_fneg_src: +; GCN: buffer_load_dword [[A:v[0-9]+]] +; GCN: buffer_load_dword [[B:v[0-9]+]] +; GCN: buffer_load_dword [[C:v[0-9]+]] + +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] +; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]] +; GCN: buffer_store_dword [[MUL]] +define amdgpu_kernel void @multi_use_fneg_src() #0 { + %a = load volatile float, float addrspace(1)* undef + %b = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + + %mul = fmul float %a, %b + %neg.mul = fsub float -0.0, %mul + %cmp = fcmp oeq float %neg.mul, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + store volatile float %mul, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}multi_foldable_use_fneg_src: +; GCN: buffer_load_dword [[A:v[0-9]+]] +; GCN: buffer_load_dword [[B:v[0-9]+]] +; GCN: buffer_load_dword [[C:v[0-9]+]] + +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] +; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]] +; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]] +define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 { + %a = load volatile float, float addrspace(1)* undef + %b = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + + %mul = fmul float %a, %b + %neg.mul = fsub float -0.0, %mul + %use1 = fmul float %mul, %neg.mul + %cmp = fcmp oeq float %neg.mul, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + + store volatile i32 %select, i32 addrspace(1)* undef + store volatile float %use1, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}multi_use_fneg: +; GCN: buffer_load_dword [[A:v[0-9]+]] +; GCN: buffer_load_dword [[B:v[0-9]+]] +; GCN: buffer_load_dword [[C:v[0-9]+]] + +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 4.0, [[MUL]] +; GCN-NOT: xor +; GCN: buffer_store_dword [[MUL]] +define amdgpu_kernel void @multi_use_fneg() #0 { + %a = load volatile float, float addrspace(1)* undef + %b = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + + %mul = fmul float %a, %b + %neg.mul = fsub float -0.0, %mul + %cmp = fcmp oeq float %neg.mul, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + store volatile float %neg.mul, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}multi_foldable_use_fneg: +; GCN: buffer_load_dword [[A:v[0-9]+]] +; GCN: buffer_load_dword [[B:v[0-9]+]] + +; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]] +; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]] +; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]] +; GCN: buffer_store_dword [[MUL1]] +define amdgpu_kernel void @multi_foldable_use_fneg() #0 { + %a = load volatile float, float addrspace(1)* undef + %b = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %z = load volatile i32, i32 addrspace(1)* undef + + %mul = fmul float %a, %b + %neg.mul = fsub float -0.0, %mul + %cmp = fcmp oeq float %neg.mul, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + %use1 = fmul float %neg.mul, %mul + store volatile i32 %select, i32 addrspace(1)* undef + store volatile float %use1, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: 
{{^}}test_setcc_fneg_oeq_posk_f32: +; GCN: v_cmp_eq_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_oeq_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp oeq float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_ogt_posk_f32: +; GCN: v_cmp_gt_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_ogt_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp ogt float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_oge_posk_f32: +; GCN: v_cmp_ge_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_oge_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp oge float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_olt_posk_f32: +; GCN: v_cmp_lt_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_olt_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp olt float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_ole_posk_f32: +; GCN: v_cmp_le_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_ole_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp ole float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_one_posk_f32: +; GCN: v_cmp_lg_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_one_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp one float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_ueq_posk_f32: +; GCN: v_cmp_nlg_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_ueq_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp ueq float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_ugt_posk_f32: +; GCN: v_cmp_nle_f32_e32 vcc, -4.0, v{{[0-9]+}} 
+define amdgpu_kernel void @test_setcc_fneg_ugt_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp ugt float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_uge_posk_f32: +; GCN: v_cmp_nlt_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_uge_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp uge float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_ult_posk_f32: +; GCN: v_cmp_nge_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_ult_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp ult float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_ule_posk_f32: +; GCN: v_cmp_ngt_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_ule_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp ule float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_setcc_fneg_une_posk_f32: +; GCN: v_cmp_neq_f32_e32 vcc, -4.0, v{{[0-9]+}} +define amdgpu_kernel void @test_setcc_fneg_une_posk_f32() #0 { + %a = load volatile float, float addrspace(1)* undef + %x = load volatile i32, i32 addrspace(1)* undef + %y = load volatile i32, i32 addrspace(1)* undef + %neg.a = fsub float -0.0, %a + %cmp = fcmp une float %neg.a, 4.0 + %select = select i1 %cmp, i32 %x, i32 %y + store volatile i32 %select, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll index 4ab6da085634..caddb6f68218 100644 --- a/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/test/CodeGen/AMDGPU/setcc-opt.ll @@ -11,7 +11,7 @@ ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 0 @@ -28,7 +28,7 @@ define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 0 @@ -42,7 +42,7 @@ define void @sext_bool_icmp_ne_0(i1 
addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, -1 @@ -56,7 +56,7 @@ define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, -1 @@ -70,7 +70,7 @@ define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 0 @@ -84,7 +84,7 @@ define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 0 @@ -98,7 +98,7 @@ define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 1 @@ -111,7 +111,7 @@ define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN: v_cmp_eq_u32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] -define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 1 @@ -124,7 +124,7 @@ define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[TMP]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, -1 @@ -137,7 +137,7 @@ define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte 
[[TMP]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, -1 @@ -159,7 +159,7 @@ define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = zext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, 255 store i1 %icmp0, i1 addrspace(1)* %out @@ -172,7 +172,7 @@ define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { %b = load i8, i8 addrspace(1)* %b.ptr %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 @@ -186,7 +186,7 @@ define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nou ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -207,7 +207,7 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -218,7 +218,7 @@ define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = zext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -229,7 +229,7 @@ define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 2 @@ -241,7 +241,7 @@ define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { 
%icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 2 @@ -256,7 +256,7 @@ define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 1 @@ -267,7 +267,7 @@ define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 1 @@ -278,7 +278,7 @@ define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 2 diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll index 10d04bab9f6b..add90e9c2f3a 100644 --- a/test/CodeGen/AMDGPU/setcc.ll +++ b/test/CodeGen/AMDGPU/setcc.ll @@ -9,7 +9,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; GCN-DAG: v_cmp_eq_u32_e32 ; GCN-DAG: v_cmp_eq_u32_e64 -define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %result = icmp eq <2 x i32> %a, %b %sext = sext <2 x i1> %result to <2 x i32> store <2 x i32> %sext, <2 x i32> addrspace(1)* %out @@ -26,7 +26,7 @@ define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> % ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cmp_eq_u32_e64 -define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -43,7 +43,7 @@ define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* % ; FUNC-LABEL: {{^}}f32_oeq: ; R600: SETE_DX10 ; GCN: v_cmp_eq_f32 -define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp oeq float %a, %b %1 = sext i1 %0 to i32 @@ -54,7 +54,7 @@ entry: ; FUNC-LABEL: {{^}}f32_ogt: ; R600: SETGT_DX10 ; GCN: v_cmp_gt_f32 -define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ogt float %a, %b %1 = sext i1 %0 to i32 @@ -65,7 +65,7 @@ entry: ; FUNC-LABEL: {{^}}f32_oge: ; R600: SETGE_DX10 ; GCN: v_cmp_ge_f32 -define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 { 
+define amdgpu_kernel void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp oge float %a, %b %1 = sext i1 %0 to i32 @@ -76,7 +76,7 @@ entry: ; FUNC-LABEL: {{^}}f32_olt: ; R600: SETGT_DX10 ; GCN: v_cmp_lt_f32 -define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp olt float %a, %b %1 = sext i1 %0 to i32 @@ -87,7 +87,7 @@ entry: ; FUNC-LABEL: {{^}}f32_ole: ; R600: SETGE_DX10 ; GCN: v_cmp_le_f32 -define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ole float %a, %b %1 = sext i1 %0 to i32 @@ -105,7 +105,7 @@ entry: ; GCN: v_cmp_lg_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp one float %a, %b %1 = sext i1 %0 to i32 @@ -119,7 +119,7 @@ entry: ; R600-DAG: AND_INT ; R600-DAG: SETNE_INT ; GCN: v_cmp_o_f32 -define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ord float %a, %b %1 = sext i1 %0 to i32 @@ -137,7 +137,7 @@ entry: ; GCN: v_cmp_nlg_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ueq float %a, %b %1 = sext i1 %0 to i32 @@ -150,7 +150,7 @@ entry: ; R600: SETE_DX10 ; GCN: v_cmp_nle_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ugt float %a, %b %1 = sext i1 %0 to i32 @@ -164,7 +164,7 @@ entry: ; GCN: v_cmp_nlt_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp uge float %a, %b %1 = sext i1 %0 to i32 @@ -178,7 +178,7 @@ entry: ; GCN: v_cmp_nge_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ult float %a, %b %1 = sext i1 %0 to i32 @@ -192,7 +192,7 @@ entry: ; GCN: v_cmp_ngt_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ule float %a, %b %1 = sext i1 %0 to i32 @@ -203,7 +203,7 @@ entry: ; FUNC-LABEL: {{^}}f32_une: ; R600: SETNE_DX10 ; GCN: v_cmp_neq_f32 -define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp une float %a, %b %1 = sext i1 %0 to i32 @@ -217,7 +217,7 @@ entry: ; R600: OR_INT ; R600: SETNE_INT ; GCN: v_cmp_u_f32 -define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 { 
entry: %0 = fcmp uno float %a, %b %1 = sext i1 %0 to i32 @@ -232,7 +232,7 @@ entry: ; FUNC-LABEL: {{^}}i32_eq: ; R600: SETE_INT ; GCN: v_cmp_eq_u32 -define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp eq i32 %a, %b %1 = sext i1 %0 to i32 @@ -243,7 +243,7 @@ entry: ; FUNC-LABEL: {{^}}i32_ne: ; R600: SETNE_INT ; GCN: v_cmp_ne_u32 -define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ne i32 %a, %b %1 = sext i1 %0 to i32 @@ -254,7 +254,7 @@ entry: ; FUNC-LABEL: {{^}}i32_ugt: ; R600: SETGT_UINT ; GCN: v_cmp_gt_u32 -define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ugt i32 %a, %b %1 = sext i1 %0 to i32 @@ -265,7 +265,7 @@ entry: ; FUNC-LABEL: {{^}}i32_uge: ; R600: SETGE_UINT ; GCN: v_cmp_ge_u32 -define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp uge i32 %a, %b %1 = sext i1 %0 to i32 @@ -276,7 +276,7 @@ entry: ; FUNC-LABEL: {{^}}i32_ult: ; R600: SETGT_UINT ; GCN: v_cmp_lt_u32 -define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ult i32 %a, %b %1 = sext i1 %0 to i32 @@ -287,7 +287,7 @@ entry: ; FUNC-LABEL: {{^}}i32_ule: ; R600: SETGE_UINT ; GCN: v_cmp_le_u32 -define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ule i32 %a, %b %1 = sext i1 %0 to i32 @@ -298,7 +298,7 @@ entry: ; FUNC-LABEL: {{^}}i32_sgt: ; R600: SETGT_INT ; GCN: v_cmp_gt_i32 -define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sgt i32 %a, %b %1 = sext i1 %0 to i32 @@ -309,7 +309,7 @@ entry: ; FUNC-LABEL: {{^}}i32_sge: ; R600: SETGE_INT ; GCN: v_cmp_ge_i32 -define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sge i32 %a, %b %1 = sext i1 %0 to i32 @@ -320,7 +320,7 @@ entry: ; FUNC-LABEL: {{^}}i32_slt: ; R600: SETGT_INT ; GCN: v_cmp_lt_i32 -define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp slt i32 %a, %b %1 = sext i1 %0 to i32 @@ -331,7 +331,7 @@ entry: ; FUNC-LABEL: {{^}}i32_sle: ; R600: SETGE_INT ; GCN: v_cmp_le_i32 -define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sle i32 %a, %b %1 = sext i1 %0 to i32 @@ -348,7 +348,7 @@ entry: ; GCN-DAG: v_cmp_eq_u32 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, ; GCN: s_endpgm -define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 { +define amdgpu_kernel void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid %gep.b = getelementptr <3 x i32>, <3 x 
i32> addrspace(1)* %ptrb, i32 %tid @@ -369,7 +369,7 @@ define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptr ; GCN-DAG: v_cmp_eq_u32 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, ; GCN: s_endpgm -define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 { +define amdgpu_kernel void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid @@ -386,7 +386,7 @@ define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, ; FUNC-LABEL: setcc-i1 ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1 ; GCN: s_cmp_eq_u32 [[AND]], 0 -define void @setcc-i1(i32 %in) #0 { +define amdgpu_kernel void @setcc-i1(i32 %in) #0 { %and = and i32 %in, 1 %cmp = icmp eq i32 %and, 0 br i1 %cmp, label %endif, label %if @@ -400,7 +400,7 @@ endif: ; GCN-DAG: v_cmp_ge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} ; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 ; GCN: s_and_b64 s[2:3], [[A]], [[B]] -define void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 { +define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 { bb0: %tmp5 = fcmp oge float %cond, 0.000000e+00 %tmp7 = fcmp ole float %cond, 1.000000e+00 diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll index 1f86277e0bc6..1f1bdb055302 100644 --- a/test/CodeGen/AMDGPU/setcc64.ll +++ b/test/CodeGen/AMDGPU/setcc64.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}f64_oeq: ; GCN: v_cmp_eq_f64 -define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp oeq double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -19,7 +19,7 @@ entry: ; GCN-LABEL: {{^}}f64_ogt: ; GCN: v_cmp_gt_f64 -define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ogt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -29,7 +29,7 @@ entry: ; GCN-LABEL: {{^}}f64_oge: ; GCN: v_cmp_ge_f64 -define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp oge double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -39,7 +39,7 @@ entry: ; GCN-LABEL: {{^}}f64_olt: ; GCN: v_cmp_lt_f64 -define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp olt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -49,7 +49,7 @@ entry: ; GCN-LABEL: {{^}}f64_ole: ; GCN: v_cmp_le_f64 -define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ole double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -60,7 +60,7 @@ entry: ; GCN-LABEL: {{^}}f64_one: ; GCN: v_cmp_lg_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp one double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -70,7 
+70,7 @@ entry: ; GCN-LABEL: {{^}}f64_ord: ; GCN: v_cmp_o_f64 -define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ord double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -81,7 +81,7 @@ entry: ; GCN-LABEL: {{^}}f64_ueq: ; GCN: v_cmp_nlg_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ueq double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -93,7 +93,7 @@ entry: ; GCN: v_cmp_nle_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ugt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -104,7 +104,7 @@ entry: ; GCN-LABEL: {{^}}f64_uge: ; GCN: v_cmp_nlt_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp uge double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -115,7 +115,7 @@ entry: ; GCN-LABEL: {{^}}f64_ult: ; GCN: v_cmp_nge_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ult double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -126,7 +126,7 @@ entry: ; GCN-LABEL: {{^}}f64_ule: ; GCN: v_cmp_ngt_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ule double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -136,7 +136,7 @@ entry: ; GCN-LABEL: {{^}}f64_une: ; GCN: v_cmp_neq_f64 -define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp une double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -146,7 +146,7 @@ entry: ; GCN-LABEL: {{^}}f64_uno: ; GCN: v_cmp_u_f64 -define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp uno double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -160,7 +160,7 @@ entry: ; GCN-LABEL: {{^}}i64_eq: ; GCN: v_cmp_eq_u64 -define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp eq i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -170,7 +170,7 @@ entry: ; GCN-LABEL: {{^}}i64_ne: ; GCN: v_cmp_ne_u64 -define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ne i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -180,7 +180,7 @@ entry: ; GCN-LABEL: {{^}}i64_ugt: ; GCN: v_cmp_gt_u64 -define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ugt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -190,7 +190,7 @@ entry: ; GCN-LABEL: 
{{^}}i64_uge: ; GCN: v_cmp_ge_u64 -define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp uge i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -200,7 +200,7 @@ entry: ; GCN-LABEL: {{^}}i64_ult: ; GCN: v_cmp_lt_u64 -define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ult i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -210,7 +210,7 @@ entry: ; GCN-LABEL: {{^}}i64_ule: ; GCN: v_cmp_le_u64 -define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ule i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -220,7 +220,7 @@ entry: ; GCN-LABEL: {{^}}i64_sgt: ; GCN: v_cmp_gt_i64 -define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sgt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -230,7 +230,7 @@ entry: ; GCN-LABEL: {{^}}i64_sge: ; GCN: v_cmp_ge_i64 -define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sge i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -240,7 +240,7 @@ entry: ; GCN-LABEL: {{^}}i64_slt: ; GCN: v_cmp_lt_i64 -define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp slt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -250,7 +250,7 @@ entry: ; GCN-LABEL: {{^}}i64_sle: ; GCN: v_cmp_le_i64 -define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sle i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 diff --git a/test/CodeGen/AMDGPU/seto.ll b/test/CodeGen/AMDGPU/seto.ll index 01e4a7fda5d2..b4385aa0ccca 100644 --- a/test/CodeGen/AMDGPU/seto.ll +++ b/test/CodeGen/AMDGPU/seto.ll @@ -4,12 +4,9 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] ; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { +define amdgpu_ps float @main(float inreg %p) { main_body: %c = fcmp oeq float %p, %p %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void + ret float %r } - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/setuo.ll b/test/CodeGen/AMDGPU/setuo.ll index 76346c4f624a..f6821b675e22 100644 --- a/test/CodeGen/AMDGPU/setuo.ll +++ b/test/CodeGen/AMDGPU/setuo.ll @@ -4,12 +4,9 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] ; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { +define amdgpu_ps float @main(float inreg %p) { main_body: %c = fcmp une float %p, %p %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void + ret float %r } - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/sext-eliminate.ll 
b/test/CodeGen/AMDGPU/sext-eliminate.ll index 7dc6eb87f6b5..0b780af17bca 100644 --- a/test/CodeGen/AMDGPU/sext-eliminate.ll +++ b/test/CodeGen/AMDGPU/sext-eliminate.ll @@ -6,7 +6,7 @@ ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: SUB_INT {{[* ]*}}[[RES]] ; EG-NOT: BFE -define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { +define amdgpu_kernel void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { %sext = sext i1 %a to i32 %res = add i32 %b, %sext store i32 %res, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: ADD_INT {{[* ]*}}[[RES]] ; EG-NOT: BFE -define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { +define amdgpu_kernel void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { %sext = sext i1 %a to i32 %res = sub i32 %b, %sext store i32 %res, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll index adba6bbb51d4..7ac4e1d9fe4b 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll @@ -11,7 +11,7 @@ ; EG: LSHR {{\*?}} [[ADDR]] ; Works with the align 2 removed -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { +define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { %c = add <2 x i32> %a, %b %x = shl <2 x i32> %c, %y = ashr <2 x i32> %x, diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index 4c58261709c4..b702e1c07200 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -1,8 +1,10 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: i16 promotion pass ruins the scalar cases when legal. 
+; FIXME: r600 fails verifier ; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: ; GCN: s_load_dword [[ARG:s[0-9]+]], @@ -13,7 +15,7 @@ ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: LSHR * [[ADDR]] ; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1 -define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 { %shl = shl i32 %in, 31 %sext = ashr i32 %shl, 31 store i32 %sext, i32 addrspace(1)* %out @@ -30,7 +32,7 @@ define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 { ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 24 %ashr = ashr i32 %shl, 24 @@ -48,7 +50,7 @@ define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 16 %ashr = ashr i32 %shl, 16 @@ -66,7 +68,7 @@ define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %c = add <1 x i32> %a, %b ; add to prevent folding into extload %shl = shl <1 x i32> %c, %ashr = ashr <1 x i32> %shl, @@ -80,7 +82,7 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 63 %ashr = ashr i64 %shl, 63 @@ -94,7 +96,7 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 56 %ashr = ashr i64 %shl, 56 @@ -109,7 +111,7 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 48 %ashr = ashr i64 %shl, 48 @@ -123,7 +125,7 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; GCN-DAG: v_mov_b32_e32 
v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 32 %ashr = ashr i64 %shl, 32 @@ -138,7 +140,7 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; XGCN: buffer_store_dword ; XEG: BFE_INT ; XEG: ASHR -; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 { +; define amdgpu_kernel void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 { ; %c = add <1 x i64> %a, %b ; %shl = shl <1 x i64> %c, ; %ashr = ashr <1 x i64> %shl, @@ -150,15 +152,15 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -177,15 +179,15 @@ define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -204,15 +206,15 @@ define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 ; GCN: v_ashrrev_i32_e32 
v[[HI:[0-9]+]], 31, v[[LO]] ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -231,12 +233,12 @@ define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, ; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} -define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} +define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -262,7 +264,7 @@ define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; EG: LSHL ; EG: ASHR [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b %x = shl i32 %c, 6 %y = ashr i32 %x, 7 @@ -285,7 +287,7 @@ define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, ; EG: LSHL ; EG: ASHR [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b %x = shl <2 x i32> %c, %y = ashr <2 x i32> %x, @@ -303,7 +305,7 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, %ashr = ashr <2 x i32> %shl, @@ -324,7 +326,7 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %c = add <4 x i32> %a, %b ; add to prevent folding into extload %shl = shl <4 x i32> %c, %ashr = ashr 
<4 x i32> %shl, @@ -341,7 +343,7 @@ define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> % ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, %ashr = ashr <2 x i32> %shl, @@ -362,7 +364,7 @@ define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %c = add <4 x i32> %a, %b ; add to prevent folding into extload %shl = shl <4 x i32> %c, %ashr = ashr <4 x i32> %shl, @@ -379,7 +381,7 @@ define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> % ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, %ashr = ashr <2 x i32> %shl, @@ -388,7 +390,7 @@ define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> } ; FUNC-LABEL: {{^}}testcase: -define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 { +define amdgpu_kernel void @testcase(i8 addrspace(1)* %out, i8 %a) #0 { %and_a_1 = and i8 %a, 1 %cmp_eq = icmp eq i8 %and_a_1, 0 %cmp_slt = icmp slt i8 %a, 0 @@ -400,7 +402,7 @@ define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 { } ; FUNC-LABEL: {{^}}testcase_3: -define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 { +define amdgpu_kernel void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 { %and_a_1 = and i8 %a, 1 %cmp_eq = icmp eq i8 %and_a_1, 0 %cmp_slt = icmp slt i8 %a, 0 @@ -416,7 +418,7 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 { ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { +define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload @@ -429,7 +431,7 @@ define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32: ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { +define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { %loada = load <4 x i32>, <4 x i32> 
addrspace(1)* %a, align 16 %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload @@ -444,7 +446,7 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; GCN: v_max_i32 ; GCN-NOT: bfe ; GCN: buffer_store_short -define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 { +define amdgpu_kernel void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 { %tmp5 = load i8, i8 addrspace(1)* %src, align 1 %tmp2 = sext i8 %tmp5 to i32 %tmp2.5 = icmp sgt i32 %tmp2, 0 @@ -455,167 +457,22 @@ define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 ad ret void } -declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_0_width: -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm -define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_8_bfe_8: -; GCN: v_bfe_i32 -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm -define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_8_bfe_16: -; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; GCN: s_endpgm -define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; This really should be folded into 1 -; FUNC-LABEL: {{^}}bfe_16_bfe_8: -; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm -define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure there isn't a redundant BFE -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe: -; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm -define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %c = add i32 %a, %b ; add to prevent folding into extload - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: -define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %c = add i32 %a, %b ; add to prevent folding into extload - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* 
%out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe: -; GCN: buffer_load_sbyte -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm -define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { - %load = load i8, i8 addrspace(1)* %ptr, align 1 - %sext = sext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; GCN: .text -; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} -; GCN-NOT: {{[^@]}}bfe -; GCN: s_endpgm -define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { - %load = load i8, i8 addrspace(1)* %ptr, align 1 - %sext = sext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0: -; GCN-NOT: shr -; GCN-NOT: shl -; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 -; GCN: s_endpgm -define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = ashr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1: -; GCN: buffer_load_dword -; GCN-NOT: shl -; GCN-NOT: shr -; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 -; GCN: s_endpgm -define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 30 - %shr = ashr i32 %shl, 30 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1: -; GCN: buffer_load_dword -; GCN-NOT: v_lshl -; GCN-NOT: v_ashr -; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 -; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 -; GCN: s_endpgm -define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 30 - %shr = ashr i32 %shl, 30 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - ; Make sure we propagate the VALUness to users of a moved scalar BFE. 
; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64_move_use: ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; GCN-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -636,16 +493,16 @@ define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrsp ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, ; GCN-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -668,10 +525,10 @@ define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrs ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] ; SI: buffer_store_short [[VBFE]] -; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 -; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 -define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { %ld = load i32, i32 addrspace(2)* %ptr %in = trunc i32 %ld to i16 %shl = shl i16 %in, 15 @@ -687,10 +544,10 @@ define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] ; SI: buffer_store_short [[VBFE]] -; VI: s_lshl_b32 s{{[0-9]+}}, 
s{{[0-9]+}}, 14 -; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 -define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 +; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 +define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { %ld = load i32, i32 addrspace(2)* %ptr %in = trunc i32 %ld to i16 %shl = shl i16 %in, 14 @@ -704,7 +561,7 @@ define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}} ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]] -define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid @@ -721,11 +578,11 @@ define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr ; GCN: {{buffer|flat}}_load_ushort [[VAL1:v[0-9]+]] ; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]] -; VI: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]] +; GFX89: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]] ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}} ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]] -define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind { +define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -748,10 +605,10 @@ define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace( ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] ; SI: buffer_store_short [[VBFE]] -; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} -; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} -define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} +; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} +define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 14 %sext = ashr i16 %shl, 14 store i16 %sext, i16 addrspace(1)* %out @@ -765,10 +622,10 @@ define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { ; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]] ; SI: buffer_store_short [[VBFE]] -; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} -; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} -define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} +; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} +define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 8 %sext = ashr i16 %shl, 8 store i16 %sext, i16 addrspace(1)* %out @@ -782,16 +639,82 @@ define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* 
%out, i16 %in) #0 { ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] ; SI: buffer_store_short [[VBFE]] -; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}} -; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}} -define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}} +; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}} +define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 1 %sext = ashr i16 %shl, 1 store i16 %sext, i16 addrspace(1)* %out ret void } +; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i16: +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]] +; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]] +; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]] +define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { + %c = add <2 x i16> %a, %b ; add to prevent folding into extload + %shl = shl <2 x i16> %c, + %ashr = ashr <2 x i16> %shl, + store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16: +; GFX9: v_pk_add_u16 +; GFX9: v_pk_add_u16 +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} +define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { + %c = add <3 x i16> %a, %b ; add to prevent folding into extload + %shl = shl <3 x i16> %c, + %ashr = ashr <3 x i16> %shl, + store <3 x i16> %ashr, <3 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v2i2_to_v2i16: +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]] +; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]] +; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]] +define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { + %c = add <2 x i16> %a, %b ; add to prevent folding into extload + %shl = shl <2 x i16> %c, + %ashr = ashr <2 x i16> %shl, + store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i16: +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]] +; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]] +; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]] +define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { + %c = add <2 x i16> %a, %b ; add to prevent folding into extload + %shl = shl <2 x i16> %c, + %ashr = ashr <2 x i16> %shl, + store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16: +; GFX9: v_pk_add_u16 +; GFX9: v_pk_add_u16 +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} +define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { + %c = add <3 x i16> %a, %b ; add to prevent folding into extload + %shl = shl <3 x i16> %c, + %ashr = ashr <3 x i16> %shl, + store <3 x i16> %ashr, <3 x i16> addrspace(1)* %out + ret void +} + declare i32 @llvm.r600.read.tidig.x() #1 attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll 
b/test/CodeGen/AMDGPU/sgpr-control-flow.ll index d5d2f6b717f9..8e18ab5554e4 100644 --- a/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -13,7 +13,7 @@ ; SI: s_sub -define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -52,7 +52,7 @@ endif: ; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]] ; SI: buffer_store_dword ; SI-NEXT: s_endpgm -define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -79,7 +79,7 @@ endif: ; SI: s_add_i32 [[SGPR:s[0-9]+]] ; SI-NOT: s_add_i32 [[SGPR]] -define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %tid_f = uitofp i32 %tid to float @@ -116,7 +116,7 @@ endif: ; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] ; SI: buffer_store_dword [[RESULT]] -define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp1 = icmp eq i32 %tid, 0 diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll index f44ae6e09e9f..fb0bbaa9cbf2 100644 --- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll +++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll @@ -6,7 +6,7 @@ ; SI-LABEL: {{^}}test_dup_operands: ; SI: v_add_i32_e32 -define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { +define amdgpu_kernel void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %lo = extractelement <2 x i32> %a, i32 0 %hi = extractelement <2 x i32> %a, i32 1 diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll index 013f5253b369..5c20e9a8d585 100644 --- a/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -1,13 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -; This test checks that no VGPR to SGPR copies are created by the register -; allocator. 
- - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - - ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] @@ -29,13 +22,13 @@ ELSE: ; preds = %main_body ENDIF: ; preds = %ELSE, %main_body %temp.0 = phi float [ %tmp26, %ELSE ], [ %tmp21, %main_body ] %tmp27 = fadd float %temp.0, %tmp23 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0 ret void } ; Make sure this program doesn't crash ; CHECK-LABEL: {{^}}phi2: -define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -58,28 +51,54 @@ main_body: %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0 %tmp38 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0 %tmp39 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp38, !tbaa !0 - %tmp40 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5) - %tmp41 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5) - %tmp42 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg3, <2 x i32> %arg5) - %tmp43 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg3, <2 x i32> %arg5) - %tmp44 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg3, <2 x i32> %arg5) - %tmp45 = bitcast float %tmp40 to i32 - %tmp46 = bitcast float %tmp41 to i32 + %i.i = extractelement <2 x i32> %arg5, i32 0 + %j.i = extractelement <2 x i32> %arg5, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #1 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #1 + %i.i19 = extractelement <2 x i32> %arg5, i32 0 + %j.i20 = extractelement <2 x i32> %arg5, i32 1 + %i.f.i21 = bitcast i32 %i.i19 to float + %j.f.i22 = bitcast i32 %j.i20 to float + %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 1, i32 0, i32 %arg3) #1 + %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 1, i32 0, i32 %arg3) #1 + %i.i13 = extractelement <2 x i32> %arg5, i32 0 + %j.i14 = extractelement <2 x i32> %arg5, i32 1 + %i.f.i15 = bitcast i32 %i.i13 to float + %j.f.i16 = bitcast i32 %j.i14 to float + %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 1, i32 %arg3) #1 + %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float 
%j.f.i16, i32 0, i32 1, i32 %arg3) #1 + %i.i7 = extractelement <2 x i32> %arg5, i32 0 + %j.i8 = extractelement <2 x i32> %arg5, i32 1 + %i.f.i9 = bitcast i32 %i.i7 to float + %j.f.i10 = bitcast i32 %j.i8 to float + %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 1, i32 %arg3) #1 + %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 1, i32 %arg3) #1 + %i.i1 = extractelement <2 x i32> %arg5, i32 0 + %j.i2 = extractelement <2 x i32> %arg5, i32 1 + %i.f.i3 = bitcast i32 %i.i1 to float + %j.f.i4 = bitcast i32 %j.i2 to float + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 1, i32 %arg3) #1 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 1, i32 %arg3) #1 + %tmp45 = bitcast float %p2.i to i32 + %tmp46 = bitcast float %p2.i24 to i32 %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0 %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1 %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32> - %tmp49 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp48, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp50 = extractelement <4 x float> %tmp49, i32 2 - %tmp51 = call float @fabs(float %tmp50) - %tmp52 = fmul float %tmp42, %tmp42 - %tmp53 = fmul float %tmp43, %tmp43 + %a.bc.i = bitcast <2 x i32> %tmp48 to <2 x float> + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp50 = extractelement <4 x float> %tmp1, i32 2 + %tmp51 = call float @llvm.fabs.f32(float %tmp50) + %tmp52 = fmul float %p2.i18, %p2.i18 + %tmp53 = fmul float %p2.i12, %p2.i12 %tmp54 = fadd float %tmp53, %tmp52 - %tmp55 = fmul float %tmp44, %tmp44 + %tmp55 = fmul float %p2.i6, %p2.i6 %tmp56 = fadd float %tmp54, %tmp55 %tmp57 = call float @llvm.amdgcn.rsq.f32(float %tmp56) - %tmp58 = fmul float %tmp42, %tmp57 - %tmp59 = fmul float %tmp43, %tmp57 - %tmp60 = fmul float %tmp44, %tmp57 + %tmp58 = fmul float %p2.i18, %tmp57 + %tmp59 = fmul float %p2.i12, %tmp57 + %tmp60 = fmul float %p2.i6, %tmp57 %tmp61 = fmul float %tmp58, %tmp22 %tmp62 = fmul float %tmp59, %tmp23 %tmp63 = fadd float %tmp62, %tmp61 @@ -90,7 +109,7 @@ main_body: %tmp68 = fadd float %tmp67, %tmp66 %tmp69 = fmul float %tmp26, %tmp68 %tmp70 = fmul float %tmp27, %tmp68 - %tmp71 = call float @fabs(float %tmp69) + %tmp71 = call float @llvm.fabs.f32(float %tmp69) %tmp72 = fcmp olt float 0x3EE4F8B580000000, %tmp71 %tmp73 = sext i1 %tmp72 to i32 %tmp74 = bitcast i32 %tmp73 to float @@ -110,7 +129,7 @@ IF: ; preds = %main_body ENDIF: ; preds = %IF, %main_body %temp4.0 = phi float [ %tmp83, %IF ], [ %tmp31, %main_body ] - %tmp84 = call float @fabs(float %tmp70) + %tmp84 = call float @llvm.fabs.f32(float %tmp70) %tmp85 = fcmp olt float 0x3EE4F8B580000000, %tmp84 %tmp86 = sext i1 %tmp85 to i32 %tmp87 = bitcast i32 %tmp86 to float @@ -146,11 +165,9 @@ ENDIF24: ; preds = %IF25, %ENDIF %tmp110 = fmul float %tmp109, %tmp106 %tmp111 = fsub float -0.000000e+00, %tmp105 %tmp112 = fmul float %tmp111, %tmp106 - %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110) - %tmp114 = bitcast i32 %tmp113 to float - %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00) - %tmp116 = bitcast i32 %tmp115 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp114, float %tmp116, float %tmp114, float %tmp116) + %tmp113 = call <2 x half> 
@llvm.amdgcn.cvt.pkrtz(float %tmp108, float %tmp110) + %tmp115 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp112, float 1.000000e+00) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp113, <2 x half> %tmp115, i1 true, i1 true) #0 ret void } @@ -183,7 +200,7 @@ LOOP: ; preds = %ENDIF, %main_body br i1 %tmp33, label %IF, label %ENDIF IF: ; preds = %LOOP - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00, i1 true, i1 true) #0 ret void ENDIF: ; preds = %LOOP @@ -193,31 +210,6 @@ ENDIF: ; preds = %LOOP br label %LOOP } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <8 x i32>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.amdgcn.rsq.f32(float) #1 - -declare float @llvm.exp2.f32(float) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.pow.f32(float, float) #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - ; This checks for a bug in the FixSGPRCopies pass where VReg96 ; registers were being identified as an SGPR regclass which was causing ; an assertion failure. @@ -248,24 +240,24 @@ entry: br i1 %tmp27, label %if, label %else if: ; preds = %entry - %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %val.if.0 = extractelement <4 x float> %val.if, i32 0 - %val.if.1 = extractelement <4 x float> %val.if, i32 1 - %val.if.2 = extractelement <4 x float> %val.if, i32 2 + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %val.if.0 = extractelement <4 x float> %tmp1, i32 0 + %val.if.1 = extractelement <4 x float> %tmp1, i32 1 + %val.if.2 = extractelement <4 x float> %tmp1, i32 2 br label %endif else: ; preds = %entry - %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %val.else.0 = extractelement <4 x float> %val.else, i32 0 - %val.else.1 = extractelement <4 x float> %val.else, i32 1 - %val.else.2 = extractelement <4 x float> %val.else, i32 2 + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %val.else.0 = extractelement <4 x float> %tmp2, i32 0 + %val.else.1 = extractelement <4 x float> %tmp2, i32 1 + %val.else.2 = extractelement <4 x float> %tmp2, i32 2 br label %endif endif: ; preds = %else, %if %val.0 = phi float [ %val.if.0, %if ], [ %val.else.0, %else ] %val.1 = phi float [ %val.if.1, %if ], [ %val.else.1, %else ] %val.2 = phi float [ %val.if.2, %if ], [ %val.else.2, %else ] - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 
15, float %val.0, float %val.1, float %val.2, float 0.000000e+00, i1 true, i1 true) #0 ret void } @@ -273,7 +265,7 @@ endif: ; preds = %else, %if ; CHECK: buffer_load_dword ; CHECK: v_add ; CHECK: s_endpgm -define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { +define amdgpu_kernel void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { entry: %tmp = load float, float addrspace(1)* %in0 %tmp1 = fcmp oeq float %tmp, 0.000000e+00 @@ -312,7 +304,7 @@ LOOP68: ; preds = %ENDIF69, %entry IF70: ; preds = %LOOP68 %q = icmp ne i32 %l, 13 %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0 ret void ENDIF69: ; preds = %LOOP68 @@ -337,41 +329,53 @@ ENDIF69: ; preds = %LOOP68 define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { bb: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 - %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !2 + %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !3 %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 - %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !2 + %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !3 %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 - %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !2 - %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7) - %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7) + %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !3 + %i.i = extractelement <2 x i32> %arg7, i32 0 + %j.i = extractelement <2 x i32> %arg7, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #0 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #0 + %i.i1 = extractelement <2 x i32> %arg7, i32 0 + %j.i2 = extractelement <2 x i32> %arg7, i32 1 + %i.f.i3 = bitcast i32 %i.i1 to float + %j.f.i4 = bitcast i32 %j.i2 to float + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #0 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #0 %tmp31 = bitcast float %tmp23 to i32 %tmp36 = icmp ne i32 %tmp31, 0 br i1 %tmp36, label %bb38, label %bb80 bb38: ; preds = %bb - %tmp52 = bitcast float %tmp29 to i32 - %tmp53 = bitcast float %tmp30 to i32 + %tmp52 = bitcast float %p2.i to i32 + %tmp53 = bitcast float %p2.i6 to i32 %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 %tmp56 = bitcast <8 x i32> %tmp26 
to <8 x i32> - %tmp58 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp55, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %a.bc.i = bitcast <2 x i32> %tmp55 to <2 x float> + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) br label %bb71 bb80: ; preds = %bb - %tmp81 = bitcast float %tmp29 to i32 - %tmp82 = bitcast float %tmp30 to i32 + %tmp81 = bitcast float %p2.i to i32 + %tmp82 = bitcast float %p2.i6 to i32 %tmp82.2 = add i32 %tmp82, 1 %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 %tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32> - %tmp87 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %a.bc.i1 = bitcast <2 x i32> %tmp84 to <2 x float> + %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) br label %bb71 bb71: ; preds = %bb80, %bb38 - %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ] + %tmp72 = phi <4 x float> [ %tmp2, %bb38 ], [ %tmp3, %bb80 ] %tmp88 = extractelement <4 x float> %tmp72, i32 0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp88, float %tmp88, float %tmp88, float %tmp88, i1 true, i1 true) #0 ret void } @@ -379,14 +383,14 @@ bb71: ; preds = %bb80, %bb38 ; CHECK-LABEL: {{^}}mimg_srsrc_sgpr: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { +bb: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp10 = extractelement <4 x float> %tmp9, i32 0 - %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10) - %tmp13 = bitcast i32 %tmp12 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef) + %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp10 = extractelement <4 x float> %tmp, i32 0 + %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void } @@ -394,24 +398,35 @@ define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg ; CHECK-LABEL: {{^}}mimg_ssamp_sgpr: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 { +bb: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = 
load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp10 = extractelement <4 x float> %tmp9, i32 0 - %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef) - %tmp13 = bitcast i32 %tmp12 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef) + %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> undef, <4 x i32> %tmp8, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp10 = extractelement <4 x float> %tmp, i32 0 + %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void } +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.amdgcn.rsq.f32(float) #1 +declare float @llvm.exp2.f32(float) #1 +declare float @llvm.pow.f32(float, float) #1 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} -!1 = !{!"const", !3} -!2 = !{!1, !1, i64 0} -!3 = !{!"tbaa root"} +!1 = !{!"const", !2} +!2 = !{!"tbaa root"} +!3 = !{!1, !1, i64 0} diff --git a/test/CodeGen/AMDGPU/sgprcopies.ll b/test/CodeGen/AMDGPU/sgprcopies.ll new file mode 100644 index 000000000000..68cd83bb6cf0 --- /dev/null +++ b/test/CodeGen/AMDGPU/sgprcopies.ll @@ -0,0 +1,58 @@ +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}checkTwoBlocksWithUniformBranch +; GCN: BB0_2 +; GCN: v_add +define amdgpu_kernel void @checkTwoBlocksWithUniformBranch(i32 addrspace(1)* nocapture %out, i32 %width, float %xPos, float %yPos, float %xStep, float %yStep, i32 %maxIter) { +entry: + %conv = call i32 @llvm.amdgcn.workitem.id.x() #1 + %rem = urem i32 %conv, %width + %div = udiv i32 %conv, %width + %conv1 = sitofp i32 %rem to float + %x = tail call float @llvm.fmuladd.f32(float %xStep, float %conv1, float %xPos) + %conv2 = sitofp i32 %div to float + %y = tail call float @llvm.fmuladd.f32(float %yStep, float %conv2, float %yPos) + %yy = fmul float %y, %y + %xy = tail call float @llvm.fmuladd.f32(float %x, float %x, float %yy) + %cmp01 = fcmp ole float %xy, 4.000000e+00 + %cmp02 = icmp ne i32 %maxIter, 0 + %cond01 = and i1 %cmp02, %cmp01 + br i1 %cond01, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %x_val = phi float [ %call8, %for.body ], [ %x, %for.body.preheader ] + %iter_val = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %y_val = phi float [ %call9, %for.body ], [ %y, %for.body.preheader ] + %sub = fsub float -0.000000e+00, %y_val + %call7 = tail call float 
@llvm.fmuladd.f32(float %x_val, float %x_val, float %x) #1 + %call8 = tail call float @llvm.fmuladd.f32(float %sub, float %y_val, float %call7) #1 + %mul = fmul float %x_val, 2.000000e+00 + %call9 = tail call float @llvm.fmuladd.f32(float %mul, float %y_val, float %y) #1 + %inc = add nuw i32 %iter_val, 1 + %mul3 = fmul float %call9, %call9 + %0 = tail call float @llvm.fmuladd.f32(float %call8, float %call8, float %mul3) + %cmp = fcmp ole float %0, 4.000000e+00 + %cmp5 = icmp ult i32 %inc, %maxIter + %or.cond = and i1 %cmp5, %cmp + br i1 %or.cond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %iter.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %for.end.loopexit ] + %idxprom = ashr exact i32 %conv, 32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idxprom + store i32 %iter.0.lcssa, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare float @llvm.fmuladd.f32(float, float, float) #1 + +attributes #0 = { nounwind readnone } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 48bbc32abcbb..0a29db4a0580 100644 --- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -11,8 +11,8 @@ ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x %ld.64 = load i128, i128 addrspace(1)* %in.gep @@ -33,7 +33,7 @@ define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -55,8 +55,8 @@ define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x %ld.64 = load i128, i128 addrspace(1)* %in.gep @@ -77,7 +77,7 @@ define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* ; GCN-DAG: buffer_store_dwordx4 
v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -100,7 +100,7 @@ define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -113,5 +113,7 @@ define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace( declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index b85714ea54c1..6f5fc6d0f38c 100644 --- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -8,8 +8,8 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -25,7 +25,7 @@ define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -41,8 +41,8 @@ define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 
addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -57,8 +57,8 @@ define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -73,7 +73,7 @@ define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -89,7 +89,7 @@ define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -105,8 +105,8 @@ define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -121,8 +121,8 @@ define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -137,8 +137,8 @@ 
define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -155,8 +155,8 @@ define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -171,7 +171,7 @@ define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -188,7 +188,7 @@ define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -204,7 +204,7 @@ define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 ; GCN-DAG: v_mov_b32_e32 v[[BFE:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -220,7 +220,7 @@ define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, 
[[VAL]], 31 ; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} -define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -236,7 +236,7 @@ define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN: buffer_store_dword v[[SHIFT]] -define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -252,7 +252,7 @@ define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspa ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}} ; GCN: buffer_store_dword [[BFE]] -define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -268,7 +268,7 @@ define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspac ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}} ; GCN: buffer_store_dword [[BFE]] -define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -286,7 +286,7 @@ define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspa ; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]] ; GCN-NOT: v[[SHRLO]] ; GCN: buffer_store_dword v[[SHRLO]] -define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -306,7 +306,7 @@ define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addr ; GCN-NOT: v[[SHRLO]] ; GCN-NOT: v[[SHRHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} -define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call 
i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -327,7 +327,7 @@ define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -347,7 +347,7 @@ define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i6 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3 ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}} -define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -365,7 +365,7 @@ define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i6 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} ; GCN: buffer_store_dword v[[ZERO]] -define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x @@ -383,5 +383,7 @@ define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %ou declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll index 28a7b924904d..a803849be02c 100644 --- a/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -8,7 +8,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 35 store i64 %shl, i64 addrspace(1)* %out @@ -20,7 +20,7 @@ define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_63(i64 addrspace(1)* %out, i64 
addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -32,7 +32,7 @@ define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 33 store i64 %shl, i64 addrspace(1)* %out @@ -43,7 +43,7 @@ define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -58,7 +58,7 @@ define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff %shl = lshr i64 %and, 40 @@ -73,7 +73,7 @@ define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]] ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 35 store i64 %shl, i64 addrspace(1)* %out @@ -84,7 +84,7 @@ define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -96,7 +96,7 @@ define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]] ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -106,7 +106,7 @@ define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x) ; GCN-LABEL: {{^}}ashr_i64_const_32: -define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void 
@ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = ashr i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -114,7 +114,7 @@ define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { } ; GCN-LABEL: {{^}}ashr_i64_const_63: -define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = ashr i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -125,7 +125,7 @@ define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 31 %trunc = trunc i64 %shl to i32 @@ -137,7 +137,7 @@ define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]] ; GCN: buffer_store_short [[SHL]] -define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 15 %trunc = trunc i64 %shl to i16 @@ -149,7 +149,7 @@ define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]] ; GCN: buffer_store_short [[SHL]] -define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in %shl = shl i32 %val, 15 %trunc = trunc i32 %shl to i16 @@ -161,7 +161,7 @@ define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]] ; GCN: buffer_store_byte [[SHL]] -define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 7 %trunc = trunc i64 %shl to i8 @@ -174,7 +174,7 @@ define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]] ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]] ; GCN: buffer_store_byte [[AND]] -define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 1 %trunc = trunc i64 %shl to i2 @@ -186,7 +186,7 @@ define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in 
%shl = shl i64 %val, 1 %trunc = trunc i64 %shl to i32 @@ -198,7 +198,7 @@ define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 16 %trunc = trunc i64 %shl to i32 @@ -209,7 +209,7 @@ define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-LABEL: {{^}}trunc_shl_33_i32_i64: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[ZERO]] -define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 33 %trunc = trunc i64 %shl to i32 @@ -222,7 +222,7 @@ define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}} ; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in %shl = shl <2 x i64> %val, <i64 16, i64 16> %trunc = trunc <2 x i64> %shl to <2 x i32> @@ -235,7 +235,7 @@ define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> ad ; GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31 ; GCN: buffer_store_dword v[[RESLO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -define void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 31 %trunc = trunc i64 %shl to i32 diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index 972349c24453..ff666cc3653b 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -1,9 +1,11 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tgid.x() #0 + ;EG: {{^}}shl_v2i32: ;EG: LSHL {{\*?
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -17,7 +19,7 @@ declare i32 @llvm.r600.read.tidig.x() #0 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -44,7 +46,7 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -57,7 +59,7 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %a = load i16, i16 addrspace(1)* %in %b = load i16, i16 addrspace(1)* %b_ptr @@ -70,7 +72,7 @@ define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { +define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { %a = load i16, i16 addrspace(1)* %in %result = shl i16 %a, %b store i16 %result, i16 addrspace(1)* %out @@ -81,7 +83,7 @@ define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { +define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { %a = load i16, i16 addrspace(1)* %in %b.add = add i16 %b, 3 %result = shl i16 %a, %b.add @@ -92,7 +94,7 @@ define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, ; GCN-LABEL: {{^}}shl_i16_computed_amount: ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 3, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, [[ADD]], v{{[0-9]+}} -define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -107,7 +109,7 @@ define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* % ; GCN-LABEL: {{^}}shl_i16_i_s: ; GCN: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 12 -define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { +define amdgpu_kernel void 
@shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { %result = shl i16 %a, 12 store i16 %result, i16 addrspace(1)* %out ret void @@ -116,7 +118,7 @@ define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { ; GCN-LABEL: {{^}}shl_v2i16: ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -133,7 +135,7 @@ define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid @@ -160,7 +162,7 @@ define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in ; GCN-LABEL: {{^}}shl_i64: ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} ; VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -199,7 +201,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -262,7 +264,7 @@ define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr @@ -277,7 +279,7 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_shl_32_i64(i64 
addrspace(1)* %out, i64 %a) { %result = shl i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void @@ -287,8 +289,8 @@ define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { ; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]], ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}} -define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() #0 +define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tgid.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid %a = load i64, i64 addrspace(1)* %gep.in @@ -299,7 +301,7 @@ define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}s_shl_constant_i64 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { %shl = shl i64 281474976710655, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -311,7 +313,7 @@ define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { ; SI-DAG: s_movk_i32 s[[KHI:[0-9]+]], 0x11e{{$}} ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]] ; SI: buffer_store_dwordx2 -define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 1231231234567, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -323,7 +325,7 @@ define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) ; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}} ; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}} ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]] -define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 1234567, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -332,7 +334,7 @@ define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}v_shl_inline_imm_64_i64: ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}} -define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 64, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -341,7 +343,7 @@ define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; FUNC-LABEL: {{^}}s_shl_inline_imm_64_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 64, s{{[0-9]+}} -define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 64, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -349,7 +351,7 @@ define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, 
s{{[0-9]+}} -define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 1, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -357,7 +359,7 @@ define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a ; FUNC-LABEL: {{^}}s_shl_inline_imm_1.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}} -define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4607182418800017408, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -365,7 +367,7 @@ define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_1.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13830554455654793216, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -373,7 +375,7 @@ define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; FUNC-LABEL: {{^}}s_shl_inline_imm_0.5_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0.5, s{{[0-9]+}} -define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4602678819172646912, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -381,7 +383,7 @@ define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_0.5_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -0.5, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13826050856027422720, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -389,7 +391,7 @@ define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace( ; FUNC-LABEL: {{^}}s_shl_inline_imm_2.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 2.0, s{{[0-9]+}} -define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4611686018427387904, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -397,7 +399,7 @@ define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_2.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -2.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13835058055282163712, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -405,7 +407,7 @@ define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; FUNC-LABEL: {{^}}s_shl_inline_imm_4.0_i64: ; SI: 
s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 4.0, s{{[0-9]+}} -define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4616189618054758400, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -413,7 +415,7 @@ define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_4.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -4.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13839561654909534208, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -427,7 +429,7 @@ define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 1082130432, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -439,7 +441,7 @@ define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}} ; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]] ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 -1065353216, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -450,7 +452,7 @@ define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4647714815446351872, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -460,10 +462,18 @@ define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13871086852301127680, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void } +; FUNC-LABEL: {{^}}test_mul2: +; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 +define amdgpu_kernel void @test_mul2(i32 %p) { + %i = mul i32 %p, 2 + store volatile i32 %i, i32 addrspace(1)* undef + ret void +} + attributes #0 = { nounwind readnone } diff --git 
a/test/CodeGen/AMDGPU/shl.v2i16.ll b/test/CodeGen/AMDGPU/shl.v2i16.ll new file mode 100644 index 000000000000..eac29bad7cf2 --- /dev/null +++ b/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -0,0 +1,152 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s + +; GCN-LABEL: {{^}}s_shl_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] +; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] + +; VI: v_lshlrev_b32_e32 +; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +; CI: v_lshlrev_b32_e32 +; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_or_b32_e32 +define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { + %result = shl <2 x i16> %lhs, %rhs + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_shl_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] + +; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_lshlrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]] +; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr + %result = shl <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}shl_v_s_v2i16: +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 
x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = shl <2 x i16> %vgpr, %sgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}shl_s_v_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = shl <2 x i16> %sgpr, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}shl_imm_v_v2i16: +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 +define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = shl <2 x i16> <i16 8, i16 8>, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}shl_v_imm_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] +define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = shl <2 x i16> %vgpr, <i16 8, i16 8> + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_shl_v4i16: +; GCN: {{buffer|flat}}_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: {{buffer|flat}}_store_dwordx2 +define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr + %result = shl <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}shl_v_imm_v4i16: +; GCN: {{buffer|flat}}_load_dwordx2 +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GCN:
{{buffer|flat}}_store_dwordx2 +define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep + %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8> + store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll index 9b5f9fed4d79..9da4bc028016 100644 --- a/test/CodeGen/AMDGPU/shl_add_constant.ll +++ b/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -25,7 +25,7 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; SI-DAG: buffer_store_dword [[ADDREG]] ; SI-DAG: buffer_store_dword [[SHLREG]] ; SI: s_endpgm -define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -43,7 +43,7 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1 ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -61,7 +61,7 @@ define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 ; SI: s_addk_i32 [[RESULT]], 0x3d8 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %shl, %y @@ -78,7 +78,7 @@ define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) # ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]] ; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %y, %shl diff --git
a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll index 6e45759fa058..9147eb58c6ad 100644 --- a/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -19,7 +19,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 ; SI: s_endpgm -define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -39,7 +39,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad ; SI-DAG: buffer_store_dword [[RESULT]] ; SI-DAG: buffer_store_dword [[ADDUSE]] ; SI: s_endpgm -define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -55,7 +55,7 @@ define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %ad ; SI-LABEL: {{^}}load_shl_base_lds_max_offset ; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 ; SI: s_endpgm -define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 65535 %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 @@ -73,7 +73,7 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3) ; SI: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 ; SI: s_endpgm -define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 64 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -89,7 +89,7 @@ define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -104,7 +104,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a @lds2 = addrspace(3) global [512 x i32] undef, align 4 -; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; define amdgpu_kernel void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; %tid.x = tail call 
i32 @llvm.amdgcn.workitem.id.x() #1 ; %idx.0 = add nsw i32 %tid.x, 2 ; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -119,7 +119,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { +define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -134,7 +134,7 @@ define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -148,7 +148,7 @@ define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1) ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -162,7 +162,7 @@ define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -176,7 +176,7 @@ define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -190,7 +190,7 @@ define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_or_rtn_b32 
{{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -204,7 +204,7 @@ define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -214,7 +214,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ret void } -; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; define amdgpu_kernel void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 ; %idx.0 = add nsw i32 %tid.x, 2 ; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -228,7 +228,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -242,7 +242,7 @@ define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -256,7 +256,7 @@ define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -270,7 +270,7 @@ define void @atomic_umin_shl_base_lds_0(i32 
addrspace(1)* %out, i32 addrspace(1) ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 diff --git a/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll new file mode 100644 index 000000000000..14ca635c6dad --- /dev/null +++ b/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -0,0 +1,186 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; Test that add/sub with a constant is swapped to sub/add with negated +; constant to minimize code size. + +; GCN-LABEL: {{^}}v_test_i32_x_sub_64: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] +define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load i32, i32 addrspace(1)* %gep + %result = sub i32 %x, 64 + store i32 %result, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i32_x_sub_64_multi_use: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]] +; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] +; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]] +define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load volatile i32, i32 addrspace(1)* %gep + %y = load volatile i32, i32 addrspace(1)* %gep + %result0 = sub i32 %x, 64 + %result1 = sub i32 %y, 64 + store volatile i32 %result0, i32 addrspace(1)* %gep.out + store volatile i32 %result1, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i32_64_sub_x: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] +define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load i32, i32 addrspace(1)* %gep + %result = sub i32 64, %x + store i32 %result, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i32_x_sub_65: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]] +define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext 
i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load i32, i32 addrspace(1)* %gep + %result = sub i32 %x, 65 + store i32 %result, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i32_65_sub_x: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]] +define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load i32, i32 addrspace(1)* %gep + %result = sub i32 65, %x + store i32 %result, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]] +define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load i32, i32 addrspace(1)* %gep + %result = sub i32 %x, -16 + store i32 %result, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]] +define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load i32, i32 addrspace(1)* %gep + %result = sub i32 -16, %x + store i32 %result, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]] +define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load i32, i32 addrspace(1)* %gep + %result = sub i32 %x, -17 + store i32 %result, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]] +define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load i32, i32 addrspace(1)* %gep + %result = sub i32 -17, %x + store i32 %result, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}s_test_i32_x_sub_64: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64 +define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { + %result = sub i32 %x, 64 + 
call void asm sideeffect "; use $0", "s"(i32 %result) + ret void +} + +; GCN-LABEL: {{^}}v_test_i16_x_sub_64: +; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]] +; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]] +define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext + %x = load i16, i16 addrspace(1)* %gep + %result = sub i16 %x, 64 + store i16 %result, i16 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}v_test_i16_x_sub_64_multi_use: +; GCN: {{buffer|flat}}_load_ushort [[X:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[Y:v[0-9]+]] +; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]] +; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[Y]] + +; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] +; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]] +define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext + %x = load volatile i16, i16 addrspace(1)* %gep + %y = load volatile i16, i16 addrspace(1)* %gep + %result0 = sub i16 %x, 64 + %result1 = sub i16 %y, 64 + store volatile i16 %result0, i16 addrspace(1)* %gep.out + store volatile i16 %result1, i16 addrspace(1)* %gep.out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir index 1988a14b5845..6248d8a46daf 100644 --- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -7,7 +7,7 @@ # resume crashes --- | - define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -20,7 +20,7 @@ ret void } - define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -33,7 +33,7 @@ ret void } - define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -46,7 +46,7 @@ ret void } - define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -59,7 +59,7 @@ ret void } - define void 
@shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -72,7 +72,7 @@ ret void } - define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll index ef616eb63801..5c6663dbbdab 100644 --- a/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ b/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -6,10 +6,10 @@ ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_unreachable_noloop: -; GCN: s_cbranch_vccnz +; GCN: s_cbranch_scc1 ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 -define void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb1 @@ -37,12 +37,49 @@ bb5: ; preds = %bb3, %bb1 ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_ret_noloop: +; GCN: load_dwordx4 +; GCN: v_cmp_nlt_f32 +; GCN: s_and_saveexec_b64 +; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]] +; GCN-NEXT: [[UNIFIED_RET]]: +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +; GCN: .Lfunc_end +define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 + +bb1: ; preds = %bb + %tmp2 = sext i32 %tmp to i64 + %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2 + %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16 + %tmp5 = extractelement <4 x float> %tmp4, i32 1 + store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef + %cmp = fcmp ogt float %tmp5, 1.0 + br i1 %cmp, label %bb5, label %bb3 + +bb3: ; preds = %bb1 + %tmp6 = extractelement <4 x float> %tmp4, i32 2 + %tmp7 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped + +bb4: ; preds = %bb3 + ret void + +bb5: ; preds = %bb3, %bb1 + ret void +} + +; OPT-LABEL: @uniform_annotate_ret_noloop( +; OPT-NOT: call i1 @llvm.amdgcn.loop + +; GCN-LABEL: {{^}}uniform_annotate_ret_noloop: ; GCN: s_cbranch_scc1 ; GCN: s_endpgm -; GCN: .Lfunc_end1 -define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +; GCN: .Lfunc_end +define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 { bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb1 bb1: ; preds = %bb diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll new file mode 100644 index 000000000000..e50c595bc6c3 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll @@ -0,0 +1,40 @@ +; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + + +; OPT-LABEL: @annotate_unreachable( +; OPT: call { i1, i64 } @llvm.amdgcn.if( +; OPT-NOT: call void @llvm.amdgcn.end.cf( + + +; GCN-LABEL: {{^}}annotate_unreachable: +; GCN: s_and_saveexec_b64 +; GCN-NOT: s_endpgm +; GCN: .Lfunc_end0 +define amdgpu_kernel void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 + +bb1: ; preds = %bb + %tmp2 = sext i32 %tmp to i64 + %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2 + %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16 + br i1 undef, label %bb3, label %bb5 ; label order reversed + +bb3: ; preds = %bb1 + %tmp6 = extractelement <4 x float> %tmp4, i32 2 + %tmp7 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp7, label %bb4, label %bb5 + +bb4: ; preds = %bb3 + unreachable + +bb5: ; preds = %bb3, %bb1 + unreachable +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll index d658b229fd37..a4b6d1fd069d 100644 --- a/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -10,7 +10,7 @@ ; SI: s_andn2_b64 ; s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm -define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { main_body: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %0 = and i32 %a, %tid @@ -40,7 +40,7 @@ ENDIF: ; SI: s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm -define void @phi_cond_outside_loop(i32 %b) { +define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %0 = icmp eq i32 %tid , 0 @@ -68,7 +68,7 @@ exit: ; CHECK-LABEL: {{^}}switch_unreachable: ; CHECK-NOT: s_endpgm ; CHECK: .Lfunc_end2 -define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { +define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { centry: switch i32 %x, label %sw.default [ i32 0, label %sw.bb @@ -100,7 +100,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; SI: [[ENDPGM]]: ; SI: s_endpgm -define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { +define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { entry: %cmp = icmp sgt i32 %c0, 0 br label %while.cond.outer diff --git a/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll index 025a3d8fca2e..b0473f3b5bda 100644 --- a/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -6,7 +6,7 @@ ; CHECK s_or_b64 exec, exec ; CHECK s_andn2_b64 exec, exec ; CHECK s_cbranch_execnz -define void @test(i32 %arg, i32 %arg1) { +define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { bb: %tmp = icmp ne i32 %arg, 0 %tmp7 = icmp ne i32 %arg1, 0 diff --git a/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir index 0c08deb13a8e..20052e865a54 100644 --- a/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir +++ b/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -run-pass 
si-fix-sgpr-copies %s -o - | FileCheck %s -check-prefixes=GCN --- | - define void @phi_visit_order() { ret void } + define amdgpu_kernel void @phi_visit_order() { ret void } name: phi_visit_order tracksRegLiveness: true diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll index 0d1de6662f25..580268deb85d 100644 --- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll +++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -5,7 +5,7 @@ ; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load volatile i32, i32 addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/si-literal-folding.ll b/test/CodeGen/AMDGPU/si-literal-folding.ll deleted file mode 100644 index b3f000c8ccd2..000000000000 --- a/test/CodeGen/AMDGPU/si-literal-folding.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}main: -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}} -define amdgpu_vs void @main(float) { -main_body: - %1 = fmul float %0, 0x3FE86A7F00000000 - %2 = fmul float %0, 0xBFE86A7F00000000 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %1, float %1, float %2, float %2) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll index 8e846d7a238e..3a7359ea4ffa 100644 --- a/test/CodeGen/AMDGPU/si-lod-bias.ll +++ b/test/CodeGen/AMDGPU/si-lod-bias.ll @@ -1,12 +1,12 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; This shader has the potential to generated illegal VGPR to SGPR copies if ; the wrong register class is used for the REG_SEQUENCE instructions. 
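; A schematic (hypothetical) example of the failure mode described above, in
; MIR, would be a REG_SEQUENCE assigned an SGPR class while its inputs live in
; VGPRs, forcing an illegal VGPR-to-SGPR copy:
;   %2:sreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, %1:vgpr_32, %subreg.sub1
;   %3:sgpr_32 = COPY %4:vgpr_32
; The register numbers, class names, and exact MIR syntax here are only
; illustrative and are not taken from this test or tied to this LLVM revision.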
-; CHECK: {{^}}main: -; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf -define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) { +; GCN-LABEL: {{^}}main: +; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf +define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -15,38 +15,45 @@ main_body: %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0 %tmp24 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0 %tmp25 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp24, !tbaa !0 - %tmp26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5) - %tmp27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5) + %i.i = extractelement <2 x i32> %arg5, i32 0 + %j.i = extractelement <2 x i32> %arg5, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #0 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #0 + %i.i1 = extractelement <2 x i32> %arg5, i32 0 + %j.i2 = extractelement <2 x i32> %arg5, i32 1 + %i.f.i3 = bitcast i32 %i.i1 to float + %j.f.i4 = bitcast i32 %j.i2 to float + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg3) #0 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg3) #0 %tmp28 = bitcast float %tmp21 to i32 - %tmp29 = bitcast float %tmp26 to i32 - %tmp30 = bitcast float %tmp27 to i32 + %tmp29 = bitcast float %p2.i to i32 + %tmp30 = bitcast float %p2.i6 to i32 %tmp31 = insertelement <4 x i32> undef, i32 %tmp28, i32 0 %tmp32 = insertelement <4 x i32> %tmp31, i32 %tmp29, i32 1 %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2 %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3 %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32> - %tmp35 = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> %tmp34, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp34.bc = bitcast <4 x i32> %tmp34 to <4 x float> + %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp36 = extractelement <4 x float> %tmp35, i32 0 %tmp37 = extractelement <4 x float> %tmp35, i32 1 %tmp38 = extractelement <4 x float> %tmp35, i32 2 %tmp39 = extractelement <4 x float> %tmp35, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp36, float %tmp37, float 
%tmp38, float %tmp39) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp36, float %tmp37, float %tmp38, float %tmp39, i1 true, i1 true) #0 ret void } -; Function Attrs: nounwind readnone +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - - +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll index 8d66df258e43..cb010cf15300 100644 --- a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -4,17 +4,18 @@ ; GCN: v_cmp_eq_u32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; GCN: ; mask branch [[RET:BB[0-9]+]] -; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]] -; GCN: [[RET]] -; GCN: s_or_b64 exec, exec -; GCN: s_endpgm - -; GCN: [[UNREACHABLE]]: +; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 +; GCN: ; divergent unreachable ; GCN: s_waitcnt -define void @lower_control_flow_unreachable_terminator() #0 { + +; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN: s_endpgm + +define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp63 = icmp eq i32 %tmp15, 32 @@ -29,19 +30,20 @@ ret: } ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order: -; GCN: v_cmp_eq_u32 +; GCN: v_cmp_ne_u32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]] -; GCN-NEXT: ; %ret -; GCN-NEXT: s_endpgm - -; GCN-NEXT: [[UNREACHABLE]]: -; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 +; GCN: ; divergent unreachable ; GCN: s_waitcnt -define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { + +; GCN: [[RETURN]]: +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp63 = icmp eq i32 %tmp15, 32 @@ -55,7 +57,29 @@ unreachable: unreachable } -; Function Attrs: nounwind readnone +; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator: +; GCN: s_cmp_lg_u32 +; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: BB#{{[0-9]+}}: ; %ret +; GCN-NEXT: s_endpgm + +; GCN: [[UNREACHABLE]]: +; GCN: ds_write_b32 +; GCN: s_waitcnt +define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 { +bb: + %tmp63 = icmp eq i32 %arg0, 32 + br i1 %tmp63, label %unreachable, label %ret + +unreachable: + store 
volatile i32 0, i32 addrspace(3)* undef, align 4 + unreachable + +ret: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.y() #1 attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/si-scheduler.ll b/test/CodeGen/AMDGPU/si-scheduler.ll index 9374ef3cd907..462528c4ff1a 100644 --- a/test/CodeGen/AMDGPU/si-scheduler.ll +++ b/test/CodeGen/AMDGPU/si-scheduler.ll @@ -3,7 +3,7 @@ ; The only way the subtarget knows that the si machine scheduler is being used ; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend ; won't know what scheduler we are using. -; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s +; RUN: llc -march=amdgcn --misched=si -mattr=si-scheduler < %s | FileCheck %s ; The test checks the "si" machine scheduler pass works correctly. @@ -22,39 +22,46 @@ main_body: %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0 %tmp23 = bitcast [17 x <4 x i32>] addrspace(2)* %arg2 to <16 x i8> addrspace(2)* %tmp24 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp23, align 16, !tbaa !0 - %tmp25 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg11) - %tmp26 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg11) - %tmp27 = bitcast float %tmp25 to i32 - %tmp28 = bitcast float %tmp26 to i32 + %i.i = extractelement <2 x i32> %arg11, i32 0 + %j.i = extractelement <2 x i32> %arg11, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #1 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #1 + %i.i1 = extractelement <2 x i32> %arg11, i32 0 + %j.i2 = extractelement <2 x i32> %arg11, i32 1 + %i.f.i3 = bitcast i32 %i.i1 to float + %j.f.i4 = bitcast i32 %j.i2 to float + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #1 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #1 + %tmp27 = bitcast float %p2.i to i32 + %tmp28 = bitcast float %p2.i6 to i32 %tmp29 = insertelement <2 x i32> undef, i32 %tmp27, i32 0 %tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1 %tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32> %tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32> - %tmp31 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp30, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp30.bc = bitcast <2 x i32> %tmp30 to <2 x float> + %tmp31 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp30.bc, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp32 = extractelement <4 x float> %tmp31, i32 0 %tmp33 = extractelement <4 x float> %tmp31, i32 1 %tmp34 = extractelement <4 x float> %tmp31, i32 2 %tmp35 = extractelement <4 x float> %tmp31, i32 3 - %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33) - %tmp37 = bitcast i32 %tmp36 to float - %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35) - %tmp39 = bitcast i32 %tmp38 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float %tmp39) + %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp32, float %tmp33) + %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp34, float %tmp35) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, 
i1 false) #0 ret void } -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index e61b4051124a..8731e74d63a0 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -1,27 +1,29 @@ -; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling,-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; These tests check that the compiler won't crash when it needs to spill ; SGPRs. - @ddxy_lds = external addrspace(3) global [64 x i32] -; CHECK-LABEL: {{^}}main: -; CHECK: s_wqm +; GCN-LABEL: {{^}}main: +; GCN: s_wqm ; Make sure not emitting unused scratch resource descriptor setup -; CHECK-NOT: s_mov_b32 -; CHECK-NOT: s_mov_b32 -; CHECK-NOT: s_mov_b32 -; CHECK-NOT: s_mov_b32 +; GCN-NOT: s_mov_b32 +; GCN-NOT: s_mov_b32 +; GCN-NOT: s_mov_b32 +; GCN-NOT: s_mov_b32 -; CHECK: s_mov_b32 m0 +; GCN: s_mov_b32 m0 +; Make sure scratch space isn't being used for SGPR->VGPR spills ; Writing to M0 from an SMRD instruction will hang the GPU. 
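; For illustration (hypothetical registers, not taken from this test), the
; pattern being rejected is an SMRD load written straight into m0, e.g.
;   s_buffer_load_dword m0, s[0:3], 0x0
; whereas a safe lowering loads into an ordinary SGPR and then copies:
;   s_buffer_load_dword s4, s[0:3], 0x0
;   s_mov_b32 m0, s4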
-; CHECK-NOT: s_buffer_load_dword m0 -; CHECK: s_endpgm +; GCN-NOT: s_buffer_load_dword m0 +; GCN: s_endpgm + +; TOVGPR: ScratchSize: 0{{$}} define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 @@ -97,29 +99,114 @@ main_body: %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0 %tmp90 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7 %tmp91 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp90, !tbaa !0 - %tmp92 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6) - %tmp93 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6) - %tmp94 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6) - %tmp95 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6) - %tmp96 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6) - %tmp97 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6) - %tmp98 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6) - %tmp99 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6) - %tmp100 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6) - %tmp101 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6) - %tmp102 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6) - %tmp103 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6) - %tmp104 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6) - %tmp105 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6) - %tmp106 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6) - %tmp107 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6) - %tmp108 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6) + %i.i = extractelement <2 x i32> %arg6, i32 0 + %j.i = extractelement <2 x i32> %arg6, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg4) #0 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg4) #0 + %i.i91 = extractelement <2 x i32> %arg6, i32 0 + %j.i92 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i93 = bitcast i32 %i.i91 to float + %j.f.i94 = bitcast i32 %j.i92 to float + %p1.i95 = call float @llvm.amdgcn.interp.p1(float %i.f.i93, i32 1, i32 0, i32 %arg4) #0 + %p2.i96 = call float @llvm.amdgcn.interp.p2(float %p1.i95, float %j.f.i94, i32 1, i32 0, i32 %arg4) #0 + %i.i85 = extractelement <2 x i32> %arg6, i32 0 + %j.i86 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i87 = bitcast i32 %i.i85 to float + %j.f.i88 = bitcast i32 %j.i86 to float + %p1.i89 = call float @llvm.amdgcn.interp.p1(float %i.f.i87, i32 0, i32 1, i32 %arg4) #0 + %p2.i90 = call float @llvm.amdgcn.interp.p2(float %p1.i89, float %j.f.i88, i32 0, i32 1, i32 %arg4) #0 + %i.i79 = extractelement <2 x i32> %arg6, i32 0 + %j.i80 = extractelement <2 x i32> 
%arg6, i32 1 + %i.f.i81 = bitcast i32 %i.i79 to float + %j.f.i82 = bitcast i32 %j.i80 to float + %p1.i83 = call float @llvm.amdgcn.interp.p1(float %i.f.i81, i32 1, i32 1, i32 %arg4) #0 + %p2.i84 = call float @llvm.amdgcn.interp.p2(float %p1.i83, float %j.f.i82, i32 1, i32 1, i32 %arg4) #0 + %i.i73 = extractelement <2 x i32> %arg6, i32 0 + %j.i74 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i75 = bitcast i32 %i.i73 to float + %j.f.i76 = bitcast i32 %j.i74 to float + %p1.i77 = call float @llvm.amdgcn.interp.p1(float %i.f.i75, i32 2, i32 1, i32 %arg4) #0 + %p2.i78 = call float @llvm.amdgcn.interp.p2(float %p1.i77, float %j.f.i76, i32 2, i32 1, i32 %arg4) #0 + %i.i67 = extractelement <2 x i32> %arg6, i32 0 + %j.i68 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i69 = bitcast i32 %i.i67 to float + %j.f.i70 = bitcast i32 %j.i68 to float + %p1.i71 = call float @llvm.amdgcn.interp.p1(float %i.f.i69, i32 0, i32 2, i32 %arg4) #0 + %p2.i72 = call float @llvm.amdgcn.interp.p2(float %p1.i71, float %j.f.i70, i32 0, i32 2, i32 %arg4) #0 + %i.i61 = extractelement <2 x i32> %arg6, i32 0 + %j.i62 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i63 = bitcast i32 %i.i61 to float + %j.f.i64 = bitcast i32 %j.i62 to float + %p1.i65 = call float @llvm.amdgcn.interp.p1(float %i.f.i63, i32 1, i32 2, i32 %arg4) #0 + %p2.i66 = call float @llvm.amdgcn.interp.p2(float %p1.i65, float %j.f.i64, i32 1, i32 2, i32 %arg4) #0 + %i.i55 = extractelement <2 x i32> %arg6, i32 0 + %j.i56 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i57 = bitcast i32 %i.i55 to float + %j.f.i58 = bitcast i32 %j.i56 to float + %p1.i59 = call float @llvm.amdgcn.interp.p1(float %i.f.i57, i32 2, i32 2, i32 %arg4) #0 + %p2.i60 = call float @llvm.amdgcn.interp.p2(float %p1.i59, float %j.f.i58, i32 2, i32 2, i32 %arg4) #0 + %i.i49 = extractelement <2 x i32> %arg6, i32 0 + %j.i50 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i51 = bitcast i32 %i.i49 to float + %j.f.i52 = bitcast i32 %j.i50 to float + %p1.i53 = call float @llvm.amdgcn.interp.p1(float %i.f.i51, i32 0, i32 3, i32 %arg4) #0 + %p2.i54 = call float @llvm.amdgcn.interp.p2(float %p1.i53, float %j.f.i52, i32 0, i32 3, i32 %arg4) #0 + %i.i43 = extractelement <2 x i32> %arg6, i32 0 + %j.i44 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i45 = bitcast i32 %i.i43 to float + %j.f.i46 = bitcast i32 %j.i44 to float + %p1.i47 = call float @llvm.amdgcn.interp.p1(float %i.f.i45, i32 1, i32 3, i32 %arg4) #0 + %p2.i48 = call float @llvm.amdgcn.interp.p2(float %p1.i47, float %j.f.i46, i32 1, i32 3, i32 %arg4) #0 + %i.i37 = extractelement <2 x i32> %arg6, i32 0 + %j.i38 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i39 = bitcast i32 %i.i37 to float + %j.f.i40 = bitcast i32 %j.i38 to float + %p1.i41 = call float @llvm.amdgcn.interp.p1(float %i.f.i39, i32 2, i32 3, i32 %arg4) #0 + %p2.i42 = call float @llvm.amdgcn.interp.p2(float %p1.i41, float %j.f.i40, i32 2, i32 3, i32 %arg4) #0 + %i.i31 = extractelement <2 x i32> %arg6, i32 0 + %j.i32 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i33 = bitcast i32 %i.i31 to float + %j.f.i34 = bitcast i32 %j.i32 to float + %p1.i35 = call float @llvm.amdgcn.interp.p1(float %i.f.i33, i32 0, i32 4, i32 %arg4) #0 + %p2.i36 = call float @llvm.amdgcn.interp.p2(float %p1.i35, float %j.f.i34, i32 0, i32 4, i32 %arg4) #0 + %i.i25 = extractelement <2 x i32> %arg6, i32 0 + %j.i26 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i27 = bitcast i32 %i.i25 to float + %j.f.i28 = bitcast i32 %j.i26 to float + %p1.i29 = call float @llvm.amdgcn.interp.p1(float %i.f.i27, i32 1, i32 4, i32 
%arg4) #0 + %p2.i30 = call float @llvm.amdgcn.interp.p2(float %p1.i29, float %j.f.i28, i32 1, i32 4, i32 %arg4) #0 + %i.i19 = extractelement <2 x i32> %arg6, i32 0 + %j.i20 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i21 = bitcast i32 %i.i19 to float + %j.f.i22 = bitcast i32 %j.i20 to float + %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 2, i32 4, i32 %arg4) #0 + %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 2, i32 4, i32 %arg4) #0 + %i.i13 = extractelement <2 x i32> %arg6, i32 0 + %j.i14 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i15 = bitcast i32 %i.i13 to float + %j.f.i16 = bitcast i32 %j.i14 to float + %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 5, i32 %arg4) #0 + %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 5, i32 %arg4) #0 + %i.i7 = extractelement <2 x i32> %arg6, i32 0 + %j.i8 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i9 = bitcast i32 %i.i7 to float + %j.f.i10 = bitcast i32 %j.i8 to float + %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 5, i32 %arg4) #0 + %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 5, i32 %arg4) #0 + %i.i1 = extractelement <2 x i32> %arg6, i32 0 + %j.i2 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i3 = bitcast i32 %i.i1 to float + %j.f.i4 = bitcast i32 %j.i2 to float + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 5, i32 %arg4) #0 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0 %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0) %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109 - %tmp111 = bitcast float %tmp92 to i32 + %tmp111 = bitcast float %p2.i to i32 store i32 %tmp111, i32 addrspace(3)* %tmp110 - %tmp112 = bitcast float %tmp93 to i32 + %tmp112 = bitcast float %p2.i96 to i32 store i32 %tmp112, i32 addrspace(3)* %tmp110 %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1) @@ -128,14 +215,14 @@ main_body: %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115 %tmp117 = add i32 %tmp115, 1 %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117 - %tmp119 = bitcast float %tmp92 to i32 + %tmp119 = bitcast float %p2.i to i32 store i32 %tmp119, i32 addrspace(3)* %tmp114 %tmp120 = load i32, i32 addrspace(3)* %tmp116 %tmp121 = bitcast i32 %tmp120 to float %tmp122 = load i32, i32 addrspace(3)* %tmp118 %tmp123 = bitcast i32 %tmp122 to float %tmp124 = fsub float %tmp123, %tmp121 - %tmp125 = bitcast float %tmp93 to i32 + %tmp125 = bitcast float %p2.i96 to i32 store i32 %tmp125, i32 addrspace(3)* %tmp114 %tmp126 = load i32, i32 addrspace(3)* %tmp116 %tmp127 = bitcast i32 %tmp126 to float @@ -148,10 +235,10 @@ main_body: %tmp134 = insertelement <4 x float> %tmp133, float %tmp130, i32 3 %tmp135 = extractelement <4 x float> %tmp134, i32 0 %tmp136 = extractelement <4 x float> %tmp134, i32 1 - %tmp137 = fmul float %tmp59, %tmp92 - %tmp138 = fmul float %tmp59, %tmp93 - %tmp139 = fmul float %tmp59, %tmp93 - %tmp140 = fmul float %tmp59, %tmp93 + %tmp137 = fmul float %tmp59, %p2.i + %tmp138 = fmul float %tmp59, %p2.i96 + %tmp139 = fmul float %tmp59, %p2.i96 + %tmp140 = fmul float %tmp59, %p2.i96 %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp141 
= call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2) %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141 @@ -204,26 +291,26 @@ main_body: %tmp180 = insertelement <4 x float> %tmp179, float %tmp176, i32 3 %tmp181 = extractelement <4 x float> %tmp180, i32 0 %tmp182 = extractelement <4 x float> %tmp180, i32 1 - %tmp183 = fdiv float 1.000000e+00, %tmp96 + %tmp183 = fdiv float 1.000000e+00, %p2.i78 %tmp184 = fmul float %tmp32, %tmp183 %tmp185 = fcmp uge float 1.000000e+00, %tmp184 %tmp186 = select i1 %tmp185, float %tmp184, float 1.000000e+00 %tmp187 = fmul float %tmp186, %tmp29 - %tmp188 = call float @ceil(float %tmp187) + %tmp188 = call float @llvm.ceil.f32(float %tmp187) %tmp189 = fcmp uge float 3.000000e+00, %tmp188 %tmp190 = select i1 %tmp189, float 3.000000e+00, float %tmp188 %tmp191 = fdiv float 1.000000e+00, %tmp190 %tmp192 = fdiv float 1.000000e+00, %tmp29 %tmp193 = fmul float %tmp190, %tmp192 %tmp194 = fmul float %tmp30, %tmp193 - %tmp195 = fmul float %tmp94, %tmp94 - %tmp196 = fmul float %tmp95, %tmp95 + %tmp195 = fmul float %p2.i90, %p2.i90 + %tmp196 = fmul float %p2.i84, %p2.i84 %tmp197 = fadd float %tmp196, %tmp195 - %tmp198 = fmul float %tmp96, %tmp96 + %tmp198 = fmul float %p2.i78, %p2.i78 %tmp199 = fadd float %tmp197, %tmp198 %tmp200 = call float @llvm.amdgcn.rsq.f32(float %tmp199) - %tmp201 = fmul float %tmp94, %tmp200 - %tmp202 = fmul float %tmp95, %tmp200 + %tmp201 = fmul float %p2.i90, %tmp200 + %tmp202 = fmul float %p2.i84, %tmp200 %tmp203 = fmul float %tmp201, %tmp28 %tmp204 = fmul float %tmp202, %tmp28 %tmp205 = fmul float %tmp203, -1.000000e+00 @@ -231,9 +318,9 @@ main_body: %tmp207 = fmul float %tmp205, %tmp31 %tmp208 = fmul float %tmp206, %tmp31 %tmp209 = fsub float -0.000000e+00, %tmp207 - %tmp210 = fadd float %tmp92, %tmp209 + %tmp210 = fadd float %p2.i, %tmp209 %tmp211 = fsub float -0.000000e+00, %tmp208 - %tmp212 = fadd float %tmp93, %tmp211 + %tmp212 = fadd float %p2.i96, %tmp211 %tmp213 = fmul float %tmp205, %tmp191 %tmp214 = fmul float %tmp206, %tmp191 %tmp215 = fmul float -1.000000e+00, %tmp191 @@ -277,7 +364,8 @@ ENDIF: ; preds = %LOOP %tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5 %tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6 %tmp242 = insertelement <8 x i32> %tmp241, i32 undef, i32 7 - %tmp243 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp242, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp242.bc = bitcast <8 x i32> %tmp242 to <8 x float> + %tmp243 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp242.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp244 = extractelement <4 x float> %tmp243, i32 3 %tmp245 = fcmp oge float %temp30.0, %tmp244 %tmp246 = sext i1 %tmp245 to i32 @@ -323,7 +411,8 @@ IF67: ; preds = %LOOP65 %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6 %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7 %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32> - %tmp277 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp276, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp276.bc = bitcast <8 x i32> %tmp276 to <8 x float> + %tmp277 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp276.bc, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp278 = extractelement <4 x 
float> %tmp277, i32 0 %tmp279 = extractelement <4 x float> %tmp277, i32 1 %tmp280 = extractelement <4 x float> %tmp277, i32 2 @@ -344,7 +433,8 @@ IF67: ; preds = %LOOP65 %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6 %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7 %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32> - %tmp297 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp296, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp296.bc = bitcast <8 x i32> %tmp296 to <8 x float> + %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp296.bc, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp298 = extractelement <4 x float> %tmp297, i32 0 %tmp299 = extractelement <4 x float> %tmp297, i32 1 %tmp300 = extractelement <4 x float> %tmp297, i32 2 @@ -363,7 +453,8 @@ IF67: ; preds = %LOOP65 %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6 %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7 %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32> - %tmp315 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp314, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp314.bc = bitcast <8 x i32> %tmp314 to <8 x float> + %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp314.bc, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp316 = extractelement <4 x float> %tmp315, i32 0 %tmp317 = extractelement <4 x float> %tmp315, i32 1 %tmp318 = extractelement <4 x float> %tmp315, i32 2 @@ -393,7 +484,8 @@ IF67: ; preds = %LOOP65 %tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5 %tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6 %tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7 - %tmp345 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp344, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp344.bc = bitcast <8 x i32> %tmp344 to <8 x float> + %tmp345 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp344.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp346 = extractelement <4 x float> %tmp345, i32 0 %tmp347 = extractelement <4 x float> %tmp345, i32 1 %tmp348 = extractelement <4 x float> %tmp345, i32 2 @@ -424,14 +516,15 @@ IF67: ; preds = %LOOP65 %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6 %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7 %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32> - %tmp375 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp374, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp374.bc = bitcast <8 x i32> %tmp374 to <8 x float> + %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp374.bc, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp376 = extractelement <4 x float> %tmp375, i32 0 %tmp377 = extractelement <4 x float> %tmp375, i32 1 %tmp378 = extractelement <4 x float> %tmp375, i32 2 %tmp379 = extractelement <4 x float> %tmp375, i32 3 - %tmp380 = fsub float -0.000000e+00, %tmp94 - %tmp381 = fsub float -0.000000e+00, %tmp95 - %tmp382 = fsub float -0.000000e+00, %tmp96 + %tmp380 = fsub float -0.000000e+00, 
%p2.i90 + %tmp381 = fsub float -0.000000e+00, %p2.i84 + %tmp382 = fsub float -0.000000e+00, %p2.i78 %tmp383 = fmul float %tmp358, %tmp380 %tmp384 = fmul float %tmp359, %tmp381 %tmp385 = fadd float %tmp384, %tmp383 @@ -449,20 +542,20 @@ IF67: ; preds = %LOOP65 %tmp397 = fadd float %tmp381, %tmp396 %tmp398 = fsub float -0.000000e+00, %tmp393 %tmp399 = fadd float %tmp382, %tmp398 - %tmp400 = fmul float %tmp395, %tmp97 - %tmp401 = fmul float %tmp395, %tmp98 - %tmp402 = fmul float %tmp395, %tmp99 - %tmp403 = fmul float %tmp397, %tmp100 + %tmp400 = fmul float %tmp395, %p2.i72 + %tmp401 = fmul float %tmp395, %p2.i66 + %tmp402 = fmul float %tmp395, %p2.i60 + %tmp403 = fmul float %tmp397, %p2.i54 %tmp404 = fadd float %tmp403, %tmp400 - %tmp405 = fmul float %tmp397, %tmp101 + %tmp405 = fmul float %tmp397, %p2.i48 %tmp406 = fadd float %tmp405, %tmp401 - %tmp407 = fmul float %tmp397, %tmp102 + %tmp407 = fmul float %tmp397, %p2.i42 %tmp408 = fadd float %tmp407, %tmp402 - %tmp409 = fmul float %tmp399, %tmp103 + %tmp409 = fmul float %tmp399, %p2.i36 %tmp410 = fadd float %tmp409, %tmp404 - %tmp411 = fmul float %tmp399, %tmp104 + %tmp411 = fmul float %tmp399, %p2.i30 %tmp412 = fadd float %tmp411, %tmp406 - %tmp413 = fmul float %tmp399, %tmp105 + %tmp413 = fmul float %tmp399, %p2.i24 %tmp414 = fadd float %tmp413, %tmp408 %tmp415 = bitcast float %tmp135 to i32 %tmp416 = bitcast float %tmp181 to i32 @@ -479,7 +572,8 @@ IF67: ; preds = %LOOP65 %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6 %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7 %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32> - %tmp429 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp428, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp428.bc = bitcast <8 x i32> %tmp428 to <8 x float> + %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp428.bc, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp430 = extractelement <4 x float> %tmp429, i32 0 %tmp431 = extractelement <4 x float> %tmp429, i32 1 %tmp432 = extractelement <4 x float> %tmp429, i32 2 @@ -502,12 +596,22 @@ IF67: ; preds = %LOOP65 %tmp449 = insertelement <4 x float> %tmp448, float %tmp445, i32 1 %tmp450 = insertelement <4 x float> %tmp449, float %tmp447, i32 2 %tmp451 = insertelement <4 x float> %tmp450, float %tmp194, i32 3 - %tmp452 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp451) + %tmp451.x = extractelement <4 x float> %tmp451, i32 0 + %tmp451.y = extractelement <4 x float> %tmp451, i32 1 + %tmp451.z = extractelement <4 x float> %tmp451, i32 2 + %cubetc = call float @llvm.amdgcn.cubetc(float %tmp451.x, float %tmp451.y, float %tmp451.z) + %cubesc = call float @llvm.amdgcn.cubesc(float %tmp451.x, float %tmp451.y, float %tmp451.z) + %cubema = call float @llvm.amdgcn.cubema(float %tmp451.x, float %tmp451.y, float %tmp451.z) + %cubeid = call float @llvm.amdgcn.cubeid(float %tmp451.x, float %tmp451.y, float %tmp451.z) + %tmp452.0 = insertelement <4 x float> undef, float %cubetc, i32 0 + %tmp452.1 = insertelement <4 x float> %tmp452.0, float %cubesc, i32 1 + %tmp452.2 = insertelement <4 x float> %tmp452.1, float %cubema, i32 2 + %tmp452 = insertelement <4 x float> %tmp452.2, float %cubeid, i32 3 %tmp453 = extractelement <4 x float> %tmp452, i32 0 %tmp454 = extractelement <4 x float> %tmp452, i32 1 %tmp455 = extractelement <4 x float> %tmp452, i32 2 %tmp456 = extractelement <4 x float> %tmp452, i32 3 - %tmp457 
= call float @fabs(float %tmp455) + %tmp457 = call float @llvm.fabs.f32(float %tmp455) %tmp458 = fdiv float 1.000000e+00, %tmp457 %tmp459 = fmul float %tmp453, %tmp458 %tmp460 = fadd float %tmp459, 1.500000e+00 @@ -521,7 +625,8 @@ IF67: ; preds = %LOOP65 %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2 %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3 %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32> - %tmp470 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp469, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp469.bc = bitcast <4 x i32> %tmp469 to <4 x float> + %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp469.bc, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %tmp471 = extractelement <4 x float> %tmp470, i32 0 %tmp472 = extractelement <4 x float> %tmp470, i32 1 %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -531,15 +636,15 @@ IF67: ; preds = %LOOP65 %tmp477 = fadd float %tmp476, %tmp329 %tmp478 = fmul float %tmp432, %tmp473 %tmp479 = fadd float %tmp478, %tmp330 - %tmp480 = fmul float %tmp106, %tmp106 - %tmp481 = fmul float %tmp107, %tmp107 + %tmp480 = fmul float %p2.i18, %p2.i18 + %tmp481 = fmul float %p2.i12, %p2.i12 %tmp482 = fadd float %tmp481, %tmp480 - %tmp483 = fmul float %tmp108, %tmp108 + %tmp483 = fmul float %p2.i6, %p2.i6 %tmp484 = fadd float %tmp482, %tmp483 %tmp485 = call float @llvm.amdgcn.rsq.f32(float %tmp484) - %tmp486 = fmul float %tmp106, %tmp485 - %tmp487 = fmul float %tmp107, %tmp485 - %tmp488 = fmul float %tmp108, %tmp485 + %tmp486 = fmul float %p2.i18, %tmp485 + %tmp487 = fmul float %p2.i12, %tmp485 + %tmp488 = fmul float %p2.i6, %tmp485 %tmp489 = fmul float %tmp376, %tmp39 %tmp490 = fmul float %tmp377, %tmp40 %tmp491 = fmul float %tmp378, %tmp41 @@ -560,15 +665,15 @@ IF67: ; preds = %LOOP65 %tmp506 = fadd float %tmp487, %tmp505 %tmp507 = fsub float -0.000000e+00, %tmp502 %tmp508 = fadd float %tmp488, %tmp507 - %tmp509 = fmul float %tmp94, %tmp94 - %tmp510 = fmul float %tmp95, %tmp95 + %tmp509 = fmul float %p2.i90, %p2.i90 + %tmp510 = fmul float %p2.i84, %p2.i84 %tmp511 = fadd float %tmp510, %tmp509 - %tmp512 = fmul float %tmp96, %tmp96 + %tmp512 = fmul float %p2.i78, %p2.i78 %tmp513 = fadd float %tmp511, %tmp512 %tmp514 = call float @llvm.amdgcn.rsq.f32(float %tmp513) - %tmp515 = fmul float %tmp94, %tmp514 - %tmp516 = fmul float %tmp95, %tmp514 - %tmp517 = fmul float %tmp96, %tmp514 + %tmp515 = fmul float %p2.i90, %tmp514 + %tmp516 = fmul float %p2.i84, %tmp514 + %tmp517 = fmul float %p2.i78, %tmp514 %tmp518 = fmul float %tmp504, %tmp515 %tmp519 = fmul float %tmp506, %tmp516 %tmp520 = fadd float %tmp519, %tmp518 @@ -623,7 +728,8 @@ IF67: ; preds = %LOOP65 %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6 %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7 %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32> - %tmp571 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp570, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp570.bc = bitcast <8 x i32> %tmp570 to <8 x float> + %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp570.bc, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp572 = extractelement <4 x float> %tmp571, i32 0 %tmp573 = extractelement <4 x float> %tmp571, i32 1 %tmp574 = extractelement <4 x float> 
%tmp571, i32 2 @@ -633,11 +739,9 @@ IF67: ; preds = %LOOP65 %tmp578 = fadd float %tmp577, %tmp554 %tmp579 = fmul float %tmp574, %tmp45 %tmp580 = fadd float %tmp579, %tmp556 - %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578) - %tmp582 = bitcast i32 %tmp581 to float - %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282) - %tmp584 = bitcast i32 %tmp583 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp582, float %tmp584, float %tmp582, float %tmp584) + %tmp581 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp576, float %tmp578) + %tmp583 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp580, float %tmp282) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp581, <2 x half> %tmp583, i1 true, i1 true) #0 ret void ENDIF66: ; preds = %LOOP65 @@ -647,7 +751,8 @@ ENDIF66: ; preds = %LOOP65 %tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5 %tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6 %tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7 - %tmp591 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp590, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp590.bc = bitcast <8 x i32> %tmp590 to <8 x float> + %tmp591 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp590.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp592 = extractelement <4 x float> %tmp591, i32 3 %tmp593 = fcmp oge float %temp30.1, %tmp592 %tmp594 = sext i1 %tmp593 to i32 @@ -670,9 +775,10 @@ ENDIF66: ; preds = %LOOP65 br label %LOOP65 } -; CHECK-LABEL: {{^}}main1: -; CHECK: s_endpgm -define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { +; GCN-LABEL: {{^}}main1: +; GCN: s_endpgm +; TOVGPR: ScratchSize: 0{{$}} +define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -817,52 +923,210 @@ main_body: %tmp160 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp159, !tbaa !0 %tmp161 = fcmp ugt float %arg17, 0.000000e+00 %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00 - %tmp163 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6) - %tmp164 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6) - %tmp165 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %arg4, <2 x i32> %arg6) - %tmp166 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg4, <2 x i32> %arg6) - %tmp167 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6) - %tmp168 = call float @llvm.SI.fs.interp(i32 1, i32 1, 
i32 %arg4, <2 x i32> %arg6) - %tmp169 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6) - %tmp170 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %arg4, <2 x i32> %arg6) - %tmp171 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6) - %tmp172 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6) - %tmp173 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6) - %tmp174 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %arg4, <2 x i32> %arg6) - %tmp175 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6) - %tmp176 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6) - %tmp177 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6) - %tmp178 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %arg4, <2 x i32> %arg6) - %tmp179 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6) - %tmp180 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6) - %tmp181 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6) - %tmp182 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg4, <2 x i32> %arg6) - %tmp183 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6) - %tmp184 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6) - %tmp185 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6) - %tmp186 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %arg4, <2 x i32> %arg6) - %tmp187 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %arg4, <2 x i32> %arg6) - %tmp188 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %arg4, <2 x i32> %arg6) - %tmp189 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %arg4, <2 x i32> %arg6) - %tmp190 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %arg4, <2 x i32> %arg6) - %tmp191 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %arg4, <2 x i32> %arg6) - %tmp192 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %arg4, <2 x i32> %arg6) - %tmp193 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %arg4, <2 x i32> %arg6) - %tmp194 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %arg4, <2 x i32> %arg6) + %i.i = extractelement <2 x i32> %arg6, i32 0 + %j.i = extractelement <2 x i32> %arg6, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg4) #0 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg4) #0 + %i.i181 = extractelement <2 x i32> %arg6, i32 0 + %j.i182 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i183 = bitcast i32 %i.i181 to float + %j.f.i184 = bitcast i32 %j.i182 to float + %p1.i185 = call float @llvm.amdgcn.interp.p1(float %i.f.i183, i32 1, i32 0, i32 %arg4) #0 + %p2.i186 = call float @llvm.amdgcn.interp.p2(float %p1.i185, float %j.f.i184, i32 1, i32 0, i32 %arg4) #0 + %i.i175 = extractelement <2 x i32> %arg6, i32 0 + %j.i176 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i177 = bitcast i32 %i.i175 to float + %j.f.i178 = bitcast i32 %j.i176 to float + %p1.i179 = call float @llvm.amdgcn.interp.p1(float %i.f.i177, i32 2, i32 0, i32 %arg4) #0 + %p2.i180 = call float @llvm.amdgcn.interp.p2(float %p1.i179, float %j.f.i178, i32 2, i32 0, i32 %arg4) #0 + %i.i169 = extractelement <2 x i32> %arg6, i32 0 + %j.i170 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i171 = bitcast i32 %i.i169 to float + %j.f.i172 = bitcast i32 %j.i170 to float + %p1.i173 = call float 
@llvm.amdgcn.interp.p1(float %i.f.i171, i32 3, i32 0, i32 %arg4) #0 + %p2.i174 = call float @llvm.amdgcn.interp.p2(float %p1.i173, float %j.f.i172, i32 3, i32 0, i32 %arg4) #0 + %i.i163 = extractelement <2 x i32> %arg6, i32 0 + %j.i164 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i165 = bitcast i32 %i.i163 to float + %j.f.i166 = bitcast i32 %j.i164 to float + %p1.i167 = call float @llvm.amdgcn.interp.p1(float %i.f.i165, i32 0, i32 1, i32 %arg4) #0 + %p2.i168 = call float @llvm.amdgcn.interp.p2(float %p1.i167, float %j.f.i166, i32 0, i32 1, i32 %arg4) #0 + %i.i157 = extractelement <2 x i32> %arg6, i32 0 + %j.i158 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i159 = bitcast i32 %i.i157 to float + %j.f.i160 = bitcast i32 %j.i158 to float + %p1.i161 = call float @llvm.amdgcn.interp.p1(float %i.f.i159, i32 1, i32 1, i32 %arg4) #0 + %p2.i162 = call float @llvm.amdgcn.interp.p2(float %p1.i161, float %j.f.i160, i32 1, i32 1, i32 %arg4) #0 + %i.i151 = extractelement <2 x i32> %arg6, i32 0 + %j.i152 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i153 = bitcast i32 %i.i151 to float + %j.f.i154 = bitcast i32 %j.i152 to float + %p1.i155 = call float @llvm.amdgcn.interp.p1(float %i.f.i153, i32 2, i32 1, i32 %arg4) #0 + %p2.i156 = call float @llvm.amdgcn.interp.p2(float %p1.i155, float %j.f.i154, i32 2, i32 1, i32 %arg4) #0 + %i.i145 = extractelement <2 x i32> %arg6, i32 0 + %j.i146 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i147 = bitcast i32 %i.i145 to float + %j.f.i148 = bitcast i32 %j.i146 to float + %p1.i149 = call float @llvm.amdgcn.interp.p1(float %i.f.i147, i32 3, i32 1, i32 %arg4) #0 + %p2.i150 = call float @llvm.amdgcn.interp.p2(float %p1.i149, float %j.f.i148, i32 3, i32 1, i32 %arg4) #0 + %i.i139 = extractelement <2 x i32> %arg6, i32 0 + %j.i140 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i141 = bitcast i32 %i.i139 to float + %j.f.i142 = bitcast i32 %j.i140 to float + %p1.i143 = call float @llvm.amdgcn.interp.p1(float %i.f.i141, i32 0, i32 2, i32 %arg4) #0 + %p2.i144 = call float @llvm.amdgcn.interp.p2(float %p1.i143, float %j.f.i142, i32 0, i32 2, i32 %arg4) #0 + %i.i133 = extractelement <2 x i32> %arg6, i32 0 + %j.i134 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i135 = bitcast i32 %i.i133 to float + %j.f.i136 = bitcast i32 %j.i134 to float + %p1.i137 = call float @llvm.amdgcn.interp.p1(float %i.f.i135, i32 1, i32 2, i32 %arg4) #0 + %p2.i138 = call float @llvm.amdgcn.interp.p2(float %p1.i137, float %j.f.i136, i32 1, i32 2, i32 %arg4) #0 + %i.i127 = extractelement <2 x i32> %arg6, i32 0 + %j.i128 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i129 = bitcast i32 %i.i127 to float + %j.f.i130 = bitcast i32 %j.i128 to float + %p1.i131 = call float @llvm.amdgcn.interp.p1(float %i.f.i129, i32 2, i32 2, i32 %arg4) #0 + %p2.i132 = call float @llvm.amdgcn.interp.p2(float %p1.i131, float %j.f.i130, i32 2, i32 2, i32 %arg4) #0 + %i.i121 = extractelement <2 x i32> %arg6, i32 0 + %j.i122 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i123 = bitcast i32 %i.i121 to float + %j.f.i124 = bitcast i32 %j.i122 to float + %p1.i125 = call float @llvm.amdgcn.interp.p1(float %i.f.i123, i32 3, i32 2, i32 %arg4) #0 + %p2.i126 = call float @llvm.amdgcn.interp.p2(float %p1.i125, float %j.f.i124, i32 3, i32 2, i32 %arg4) #0 + %i.i115 = extractelement <2 x i32> %arg6, i32 0 + %j.i116 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i117 = bitcast i32 %i.i115 to float + %j.f.i118 = bitcast i32 %j.i116 to float + %p1.i119 = call float @llvm.amdgcn.interp.p1(float %i.f.i117, i32 0, i32 3, i32 %arg4) #0 + %p2.i120 = 
call float @llvm.amdgcn.interp.p2(float %p1.i119, float %j.f.i118, i32 0, i32 3, i32 %arg4) #0 + %i.i109 = extractelement <2 x i32> %arg6, i32 0 + %j.i110 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i111 = bitcast i32 %i.i109 to float + %j.f.i112 = bitcast i32 %j.i110 to float + %p1.i113 = call float @llvm.amdgcn.interp.p1(float %i.f.i111, i32 1, i32 3, i32 %arg4) #0 + %p2.i114 = call float @llvm.amdgcn.interp.p2(float %p1.i113, float %j.f.i112, i32 1, i32 3, i32 %arg4) #0 + %i.i103 = extractelement <2 x i32> %arg6, i32 0 + %j.i104 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i105 = bitcast i32 %i.i103 to float + %j.f.i106 = bitcast i32 %j.i104 to float + %p1.i107 = call float @llvm.amdgcn.interp.p1(float %i.f.i105, i32 2, i32 3, i32 %arg4) #0 + %p2.i108 = call float @llvm.amdgcn.interp.p2(float %p1.i107, float %j.f.i106, i32 2, i32 3, i32 %arg4) #0 + %i.i97 = extractelement <2 x i32> %arg6, i32 0 + %j.i98 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i99 = bitcast i32 %i.i97 to float + %j.f.i100 = bitcast i32 %j.i98 to float + %p1.i101 = call float @llvm.amdgcn.interp.p1(float %i.f.i99, i32 3, i32 3, i32 %arg4) #0 + %p2.i102 = call float @llvm.amdgcn.interp.p2(float %p1.i101, float %j.f.i100, i32 3, i32 3, i32 %arg4) #0 + %i.i91 = extractelement <2 x i32> %arg6, i32 0 + %j.i92 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i93 = bitcast i32 %i.i91 to float + %j.f.i94 = bitcast i32 %j.i92 to float + %p1.i95 = call float @llvm.amdgcn.interp.p1(float %i.f.i93, i32 0, i32 4, i32 %arg4) #0 + %p2.i96 = call float @llvm.amdgcn.interp.p2(float %p1.i95, float %j.f.i94, i32 0, i32 4, i32 %arg4) #0 + %i.i85 = extractelement <2 x i32> %arg6, i32 0 + %j.i86 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i87 = bitcast i32 %i.i85 to float + %j.f.i88 = bitcast i32 %j.i86 to float + %p1.i89 = call float @llvm.amdgcn.interp.p1(float %i.f.i87, i32 1, i32 4, i32 %arg4) #0 + %p2.i90 = call float @llvm.amdgcn.interp.p2(float %p1.i89, float %j.f.i88, i32 1, i32 4, i32 %arg4) #0 + %i.i79 = extractelement <2 x i32> %arg6, i32 0 + %j.i80 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i81 = bitcast i32 %i.i79 to float + %j.f.i82 = bitcast i32 %j.i80 to float + %p1.i83 = call float @llvm.amdgcn.interp.p1(float %i.f.i81, i32 2, i32 4, i32 %arg4) #0 + %p2.i84 = call float @llvm.amdgcn.interp.p2(float %p1.i83, float %j.f.i82, i32 2, i32 4, i32 %arg4) #0 + %i.i73 = extractelement <2 x i32> %arg6, i32 0 + %j.i74 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i75 = bitcast i32 %i.i73 to float + %j.f.i76 = bitcast i32 %j.i74 to float + %p1.i77 = call float @llvm.amdgcn.interp.p1(float %i.f.i75, i32 3, i32 4, i32 %arg4) #0 + %p2.i78 = call float @llvm.amdgcn.interp.p2(float %p1.i77, float %j.f.i76, i32 3, i32 4, i32 %arg4) #0 + %i.i67 = extractelement <2 x i32> %arg6, i32 0 + %j.i68 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i69 = bitcast i32 %i.i67 to float + %j.f.i70 = bitcast i32 %j.i68 to float + %p1.i71 = call float @llvm.amdgcn.interp.p1(float %i.f.i69, i32 0, i32 5, i32 %arg4) #0 + %p2.i72 = call float @llvm.amdgcn.interp.p2(float %p1.i71, float %j.f.i70, i32 0, i32 5, i32 %arg4) #0 + %i.i61 = extractelement <2 x i32> %arg6, i32 0 + %j.i62 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i63 = bitcast i32 %i.i61 to float + %j.f.i64 = bitcast i32 %j.i62 to float + %p1.i65 = call float @llvm.amdgcn.interp.p1(float %i.f.i63, i32 1, i32 5, i32 %arg4) #0 + %p2.i66 = call float @llvm.amdgcn.interp.p2(float %p1.i65, float %j.f.i64, i32 1, i32 5, i32 %arg4) #0 + %i.i55 = extractelement <2 x i32> %arg6, i32 0 + %j.i56 
= extractelement <2 x i32> %arg6, i32 1 + %i.f.i57 = bitcast i32 %i.i55 to float + %j.f.i58 = bitcast i32 %j.i56 to float + %p1.i59 = call float @llvm.amdgcn.interp.p1(float %i.f.i57, i32 2, i32 5, i32 %arg4) #0 + %p2.i60 = call float @llvm.amdgcn.interp.p2(float %p1.i59, float %j.f.i58, i32 2, i32 5, i32 %arg4) #0 + %i.i49 = extractelement <2 x i32> %arg6, i32 0 + %j.i50 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i51 = bitcast i32 %i.i49 to float + %j.f.i52 = bitcast i32 %j.i50 to float + %p1.i53 = call float @llvm.amdgcn.interp.p1(float %i.f.i51, i32 3, i32 5, i32 %arg4) #0 + %p2.i54 = call float @llvm.amdgcn.interp.p2(float %p1.i53, float %j.f.i52, i32 3, i32 5, i32 %arg4) #0 + %i.i43 = extractelement <2 x i32> %arg6, i32 0 + %j.i44 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i45 = bitcast i32 %i.i43 to float + %j.f.i46 = bitcast i32 %j.i44 to float + %p1.i47 = call float @llvm.amdgcn.interp.p1(float %i.f.i45, i32 0, i32 6, i32 %arg4) #0 + %p2.i48 = call float @llvm.amdgcn.interp.p2(float %p1.i47, float %j.f.i46, i32 0, i32 6, i32 %arg4) #0 + %i.i37 = extractelement <2 x i32> %arg6, i32 0 + %j.i38 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i39 = bitcast i32 %i.i37 to float + %j.f.i40 = bitcast i32 %j.i38 to float + %p1.i41 = call float @llvm.amdgcn.interp.p1(float %i.f.i39, i32 1, i32 6, i32 %arg4) #0 + %p2.i42 = call float @llvm.amdgcn.interp.p2(float %p1.i41, float %j.f.i40, i32 1, i32 6, i32 %arg4) #0 + %i.i31 = extractelement <2 x i32> %arg6, i32 0 + %j.i32 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i33 = bitcast i32 %i.i31 to float + %j.f.i34 = bitcast i32 %j.i32 to float + %p1.i35 = call float @llvm.amdgcn.interp.p1(float %i.f.i33, i32 2, i32 6, i32 %arg4) #0 + %p2.i36 = call float @llvm.amdgcn.interp.p2(float %p1.i35, float %j.f.i34, i32 2, i32 6, i32 %arg4) #0 + %i.i25 = extractelement <2 x i32> %arg6, i32 0 + %j.i26 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i27 = bitcast i32 %i.i25 to float + %j.f.i28 = bitcast i32 %j.i26 to float + %p1.i29 = call float @llvm.amdgcn.interp.p1(float %i.f.i27, i32 3, i32 6, i32 %arg4) #0 + %p2.i30 = call float @llvm.amdgcn.interp.p2(float %p1.i29, float %j.f.i28, i32 3, i32 6, i32 %arg4) #0 + %i.i19 = extractelement <2 x i32> %arg6, i32 0 + %j.i20 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i21 = bitcast i32 %i.i19 to float + %j.f.i22 = bitcast i32 %j.i20 to float + %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 0, i32 7, i32 %arg4) #0 + %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 0, i32 7, i32 %arg4) #0 + %i.i13 = extractelement <2 x i32> %arg6, i32 0 + %j.i14 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i15 = bitcast i32 %i.i13 to float + %j.f.i16 = bitcast i32 %j.i14 to float + %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 1, i32 7, i32 %arg4) #0 + %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 1, i32 7, i32 %arg4) #0 + %i.i7 = extractelement <2 x i32> %arg6, i32 0 + %j.i8 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i9 = bitcast i32 %i.i7 to float + %j.f.i10 = bitcast i32 %j.i8 to float + %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 2, i32 7, i32 %arg4) #0 + %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 2, i32 7, i32 %arg4) #0 + %i.i1 = extractelement <2 x i32> %arg6, i32 0 + %j.i2 = extractelement <2 x i32> %arg6, i32 1 + %i.f.i3 = bitcast i32 %i.i1 to float + %j.f.i4 = bitcast i32 %j.i2 to float + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 3, 
i32 7, i32 %arg4) #0 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 3, i32 7, i32 %arg4) #0 %tmp195 = fmul float %arg14, %tmp123 %tmp196 = fadd float %tmp195, %tmp124 - %tmp197 = call float @llvm.AMDGPU.clamp.f32(float %tmp162, float 0.000000e+00, float 1.000000e+00) - %tmp198 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - %tmp199 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - %tmp200 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %tmp201 = bitcast float %tmp197 to i32 + %max.0.i = call float @llvm.maxnum.f32(float %tmp162, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) + %tmp201 = bitcast float %clamp.i to i32 %tmp202 = icmp ne i32 %tmp201, 0 %. = select i1 %tmp202, float -1.000000e+00, float 1.000000e+00 - %tmp203 = fsub float -0.000000e+00, %tmp163 + %tmp203 = fsub float -0.000000e+00, %p2.i %tmp204 = fadd float %tmp43, %tmp203 - %tmp205 = fsub float -0.000000e+00, %tmp164 + %tmp205 = fsub float -0.000000e+00, %p2.i186 %tmp206 = fadd float %tmp44, %tmp205 - %tmp207 = fsub float -0.000000e+00, %tmp165 + %tmp207 = fsub float -0.000000e+00, %p2.i180 %tmp208 = fadd float %tmp45, %tmp207 %tmp209 = fmul float %tmp204, %tmp204 %tmp210 = fmul float %tmp206, %tmp206 @@ -876,12 +1140,13 @@ main_body: %tmp218 = fmul float %., %tmp53 %tmp219 = fmul float %arg13, %tmp46 %tmp220 = fmul float %tmp196, %tmp47 - %tmp221 = bitcast float %tmp173 to i32 - %tmp222 = bitcast float %tmp174 to i32 + %tmp221 = bitcast float %p2.i132 to i32 + %tmp222 = bitcast float %p2.i126 to i32 %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0 %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1 %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32> - %tmp225 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp224, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp224.bc = bitcast <2 x i32> %tmp224 to <2 x float> + %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp224.bc, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp226 = extractelement <4 x float> %tmp225, i32 0 %tmp227 = extractelement <4 x float> %tmp225, i32 1 %tmp228 = extractelement <4 x float> %tmp225, i32 2 @@ -895,34 +1160,36 @@ main_body: %result.i44 = fadd float %tmp231, %one.sub.a.i43 %one.sub.a.i41 = fsub float 1.000000e+00, %tmp26 %result.i42 = fadd float %tmp232, %one.sub.a.i41 - %tmp233 = fmul float %tmp215, %tmp183 - %tmp234 = fmul float %tmp216, %tmp184 + %tmp233 = fmul float %tmp215, %p2.i72 + %tmp234 = fmul float %tmp216, %p2.i66 %tmp235 = fadd float %tmp234, %tmp233 - %tmp236 = fmul float %tmp217, %tmp185 + %tmp236 = fmul float %tmp217, %p2.i60 %tmp237 = fadd float %tmp235, %tmp236 - %tmp238 = fmul float %tmp215, %tmp186 - %tmp239 = fmul float %tmp216, %tmp187 + %tmp238 = fmul float %tmp215, %p2.i54 + %tmp239 = fmul float %tmp216, %p2.i48 %tmp240 = fadd float %tmp239, %tmp238 - %tmp241 = fmul float %tmp217, %tmp188 + %tmp241 = fmul float %tmp217, %p2.i42 %tmp242 = fadd float %tmp240, %tmp241 - %tmp243 = fmul float %tmp215, %tmp189 - %tmp244 = fmul float %tmp216, %tmp190 + %tmp243 = fmul float %tmp215, %p2.i36 + %tmp244 = fmul float %tmp216, %p2.i30 %tmp245 = fadd float %tmp244, %tmp243 - %tmp246 = fmul float %tmp217, %tmp191 + %tmp246 = fmul float 
%tmp217, %p2.i24 %tmp247 = fadd float %tmp245, %tmp246 - %tmp248 = call float @llvm.AMDGPU.clamp.f32(float %tmp247, float 0.000000e+00, float 1.000000e+00) + %max.0.i19 = call float @llvm.maxnum.f32(float %tmp247, float 0.000000e+00) + %clamp.i20 = call float @llvm.minnum.f32(float %max.0.i19, float 1.000000e+00) %tmp249 = fmul float %tmp213, 0x3F5A36E2E0000000 - %tmp250 = call float @llvm.AMDGPU.clamp.f32(float %tmp249, float 0.000000e+00, float 1.000000e+00) - %tmp251 = fsub float -0.000000e+00, %tmp250 + %max.0.i17 = call float @llvm.maxnum.f32(float %tmp249, float 0.000000e+00) + %clamp.i18 = call float @llvm.minnum.f32(float %max.0.i17, float 1.000000e+00) + %tmp251 = fsub float -0.000000e+00, %clamp.i18 %tmp252 = fadd float 1.000000e+00, %tmp251 - %tmp253 = call float @llvm.pow.f32(float %tmp248, float 2.500000e-01) + %tmp253 = call float @llvm.pow.f32(float %clamp.i20, float 2.500000e-01) %tmp254 = fmul float %tmp38, %tmp253 %tmp255 = fmul float %tmp237, %tmp254 %tmp256 = fmul float %tmp242, %tmp254 %tmp257 = fmul float %tmp255, %tmp229 %tmp258 = fmul float %tmp256, %tmp229 - %tmp259 = fadd float %tmp248, 0x3EE4F8B580000000 - %tmp260 = fsub float -0.000000e+00, %tmp248 + %tmp259 = fadd float %clamp.i20, 0x3EE4F8B580000000 + %tmp260 = fsub float -0.000000e+00, %clamp.i20 %tmp261 = fadd float 1.000000e+00, %tmp260 %tmp262 = fmul float 1.200000e+01, %tmp261 %tmp263 = fadd float %tmp262, 4.000000e+00 @@ -942,8 +1209,8 @@ main_body: LOOP: ; preds = %LOOP, %main_body %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp288, %LOOP ] - %temp168.0 = phi float [ %tmp175, %main_body ], [ %tmp284, %LOOP ] - %temp169.0 = phi float [ %tmp176, %main_body ], [ %tmp285, %LOOP ] + %temp168.0 = phi float [ %p2.i120, %main_body ], [ %tmp284, %LOOP ] + %temp169.0 = phi float [ %p2.i114, %main_body ], [ %tmp285, %LOOP ] %temp170.0 = phi float [ %tmp252, %main_body ], [ %tmp286, %LOOP ] %tmp276 = bitcast float %temp168.0 to i32 %tmp277 = bitcast float %temp169.0 to i32 @@ -952,7 +1219,8 @@ LOOP: ; preds = %LOOP, %main_body %tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2 %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3 %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32> - %tmp282 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp281, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp281.bc = bitcast <4 x i32> %tmp281 to <4 x float> + %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp281.bc, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp283 = extractelement <4 x float> %tmp282, i32 3 %tmp284 = fadd float %temp168.0, %tmp273 %tmp285 = fadd float %temp169.0, %tmp274 @@ -979,12 +1247,12 @@ IF189: ; preds = %LOOP %tmp303 = fadd float %tmp302, %tmp284 %tmp304 = fmul float %tmp301, %tmp274 %tmp305 = fadd float %tmp304, %tmp285 - %tmp306 = fsub float -0.000000e+00, %tmp175 + %tmp306 = fsub float -0.000000e+00, %p2.i120 %tmp307 = fadd float %tmp303, %tmp306 - %tmp308 = fsub float -0.000000e+00, %tmp176 + %tmp308 = fsub float -0.000000e+00, %p2.i114 %tmp309 = fadd float %tmp305, %tmp308 - %tmp310 = fadd float %tmp175, %tmp307 - %tmp311 = fadd float %tmp176, %tmp309 + %tmp310 = fadd float %p2.i120, %tmp307 + %tmp311 = fadd float %p2.i114, %tmp309 %tmp312 = fmul float %tmp307, %tmp66 %tmp313 = fmul float %tmp309, %tmp67 %tmp314 = fmul float %tmp312, %tmp54 @@ -993,8 +1261,8 @@ IF189: ; preds = %LOOP %tmp317 = fadd float %tmp316, 
%tmp314 %tmp318 = fmul float %tmp313, %tmp57 %tmp319 = fadd float %tmp318, %tmp315 - %tmp320 = fadd float %tmp177, %tmp317 - %tmp321 = fadd float %tmp178, %tmp319 + %tmp320 = fadd float %p2.i108, %tmp317 + %tmp321 = fadd float %p2.i102, %tmp319 %tmp322 = fmul float %tmp312, %tmp58 %tmp323 = fmul float %tmp312, %tmp59 %tmp324 = fmul float %tmp312, %tmp60 @@ -1007,28 +1275,29 @@ IF189: ; preds = %LOOP %tmp331 = fadd float %tmp330, %tmp324 %tmp332 = fmul float %tmp313, %tmp65 %tmp333 = fadd float %tmp332, %tmp325 - %tmp334 = fadd float %tmp167, %tmp327 - %tmp335 = fadd float %tmp168, %tmp329 - %tmp336 = fadd float %tmp169, %tmp331 - %tmp337 = fadd float %tmp170, %tmp333 + %tmp334 = fadd float %p2.i168, %tmp327 + %tmp335 = fadd float %p2.i162, %tmp329 + %tmp336 = fadd float %p2.i156, %tmp331 + %tmp337 = fadd float %p2.i150, %tmp333 %tmp338 = bitcast float %tmp334 to i32 %tmp339 = bitcast float %tmp335 to i32 %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0 %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1 %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32> - %tmp342 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp341, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp343 = extractelement <4 x float> %tmp342, i32 0 - %tmp344 = extractelement <4 x float> %tmp342, i32 1 - %tmp345 = extractelement <4 x float> %tmp342, i32 2 - %tmp346 = extractelement <4 x float> %tmp342, i32 3 + %a.bc.i = bitcast <2 x i32> %tmp341 to <2 x float> + %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp343 = extractelement <4 x float> %tmp0, i32 0 + %tmp344 = extractelement <4 x float> %tmp0, i32 1 + %tmp345 = extractelement <4 x float> %tmp0, i32 2 + %tmp346 = extractelement <4 x float> %tmp0, i32 3 %tmp347 = fmul float %tmp343, %tmp22 %tmp348 = fmul float %tmp344, %tmp23 %tmp349 = fmul float %tmp345, %tmp24 %tmp350 = fmul float %tmp346, %tmp25 - %tmp351 = fmul float %tmp347, %tmp179 - %tmp352 = fmul float %tmp348, %tmp180 - %tmp353 = fmul float %tmp349, %tmp181 - %tmp354 = fmul float %tmp350, %tmp182 + %tmp351 = fmul float %tmp347, %p2.i96 + %tmp352 = fmul float %tmp348, %p2.i90 + %tmp353 = fmul float %tmp349, %p2.i84 + %tmp354 = fmul float %tmp350, %p2.i78 %tmp355 = fsub float -0.000000e+00, %tmp346 %tmp356 = fadd float 1.000000e+00, %tmp355 %tmp357 = fmul float %tmp356, %tmp48 @@ -1049,8 +1318,9 @@ IF189: ; preds = %LOOP %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0 %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1 %tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32> - %tmp362 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp361, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp363 = extractelement <4 x float> %tmp362, i32 2 + %a.bc.i3 = bitcast <2 x i32> %tmp361 to <2 x float> + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i3, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp363 = extractelement <4 x float> %tmp1, i32 2 %tmp364 = fmul float %result.i40, %result.i %tmp365 = fmul float %result.i36, %result.i44 %tmp366 = fmul float %result.i32, %result.i42 @@ -1060,11 +1330,12 @@ IF189: ; preds = %LOOP %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0 %tmp371 = insertelement <2 x i32> 
%tmp370, i32 %tmp369, i32 1 %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32> - %tmp372 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp371, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp373 = extractelement <4 x float> %tmp372, i32 0 - %tmp374 = extractelement <4 x float> %tmp372, i32 1 - %tmp375 = extractelement <4 x float> %tmp372, i32 2 - %tmp376 = extractelement <4 x float> %tmp372, i32 3 + %a.bc.i2 = bitcast <2 x i32> %tmp371 to <2 x float> + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i2, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp373 = extractelement <4 x float> %tmp2, i32 0 + %tmp374 = extractelement <4 x float> %tmp2, i32 1 + %tmp375 = extractelement <4 x float> %tmp2, i32 2 + %tmp376 = extractelement <4 x float> %tmp2, i32 3 %tmp377 = fcmp olt float 0.000000e+00, %tmp375 %tmp378 = sext i1 %tmp377 to i32 %tmp379 = bitcast i32 %tmp378 to float @@ -1077,11 +1348,12 @@ IF189: ; preds = %LOOP %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0 %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1 %tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32> - %tmp386 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp385, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp387 = extractelement <4 x float> %tmp386, i32 0 - %tmp388 = extractelement <4 x float> %tmp386, i32 1 - %tmp389 = extractelement <4 x float> %tmp386, i32 2 - %tmp390 = extractelement <4 x float> %tmp386, i32 3 + %a.bc.i1 = bitcast <2 x i32> %tmp385 to <2 x float> + %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp387 = extractelement <4 x float> %tmp3, i32 0 + %tmp388 = extractelement <4 x float> %tmp3, i32 1 + %tmp389 = extractelement <4 x float> %tmp3, i32 2 + %tmp390 = extractelement <4 x float> %tmp3, i32 3 %tmp391 = fcmp olt float 0.000000e+00, %tmp389 %tmp392 = sext i1 %tmp391 to i32 %tmp393 = bitcast i32 %tmp392 to float @@ -1107,8 +1379,8 @@ IF189: ; preds = %LOOP %tmp411 = fmul float %tmp410, %tmp35 %tmp412 = fmul float %tmp409, %tmp363 %tmp413 = fmul float %tmp411, %tmp363 - %tmp414 = call float @fabs(float %tmp405) - %tmp415 = call float @fabs(float %tmp407) + %tmp414 = call float @llvm.fabs.f32(float %tmp405) + %tmp415 = call float @llvm.fabs.f32(float %tmp407) %tmp416 = fsub float -0.000000e+00, %tmp414 %tmp417 = fadd float 1.000000e+00, %tmp416 %tmp418 = fsub float -0.000000e+00, %tmp415 @@ -1122,26 +1394,27 @@ IF189: ; preds = %LOOP %tmp426 = fadd float %tmp424, %tmp425 %tmp427 = fsub float -0.000000e+00, %tmp426 %tmp428 = fadd float 0x3FF00068E0000000, %tmp427 - %tmp429 = call float @llvm.AMDGPU.clamp.f32(float %tmp428, float 0.000000e+00, float 1.000000e+00) - %tmp430 = call float @llvm.amdgcn.rsq.f32(float %tmp429) - %tmp431 = fmul float %tmp430, %tmp429 - %tmp432 = fsub float -0.000000e+00, %tmp429 + %max.0.i15 = call float @llvm.maxnum.f32(float %tmp428, float 0.000000e+00) + %clamp.i16 = call float @llvm.minnum.f32(float %max.0.i15, float 1.000000e+00) + %tmp430 = call float @llvm.amdgcn.rsq.f32(float %clamp.i16) + %tmp431 = fmul float %tmp430, %clamp.i16 + %tmp432 = fsub float -0.000000e+00, %clamp.i16 %cmp = fcmp ogt float 0.000000e+00, %tmp432 %tmp433 = select i1 %cmp, float %tmp431, float 0.000000e+00 - 
%tmp434 = fmul float %tmp183, %tmp421 - %tmp435 = fmul float %tmp184, %tmp421 - %tmp436 = fmul float %tmp185, %tmp421 - %tmp437 = fmul float %tmp186, %tmp423 + %tmp434 = fmul float %p2.i72, %tmp421 + %tmp435 = fmul float %p2.i66, %tmp421 + %tmp436 = fmul float %p2.i60, %tmp421 + %tmp437 = fmul float %p2.i54, %tmp423 %tmp438 = fadd float %tmp437, %tmp434 - %tmp439 = fmul float %tmp187, %tmp423 + %tmp439 = fmul float %p2.i48, %tmp423 %tmp440 = fadd float %tmp439, %tmp435 - %tmp441 = fmul float %tmp188, %tmp423 + %tmp441 = fmul float %p2.i42, %tmp423 %tmp442 = fadd float %tmp441, %tmp436 - %tmp443 = fmul float %tmp189, %tmp433 + %tmp443 = fmul float %p2.i36, %tmp433 %tmp444 = fadd float %tmp443, %tmp438 - %tmp445 = fmul float %tmp190, %tmp433 + %tmp445 = fmul float %p2.i30, %tmp433 %tmp446 = fadd float %tmp445, %tmp440 - %tmp447 = fmul float %tmp191, %tmp433 + %tmp447 = fmul float %p2.i24, %tmp433 %tmp448 = fadd float %tmp447, %tmp442 %tmp449 = fmul float %tmp444, %tmp444 %tmp450 = fmul float %tmp446, %tmp446 @@ -1174,7 +1447,8 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0 %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1 %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32> - %tmp470 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp469, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp469.bc = bitcast <2 x i32> %tmp469 to <2 x float> + %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp469.bc, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp471 = extractelement <4 x float> %tmp470, i32 0 %tmp472 = extractelement <4 x float> %tmp470, i32 1 %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -1187,12 +1461,13 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp480 = fadd float %tmp479, %tmp40 %tmp481 = fmul float %tmp474, %tmp41 %tmp482 = fadd float %tmp481, %tmp42 - %tmp483 = bitcast float %tmp171 to i32 - %tmp484 = bitcast float %tmp172 to i32 + %tmp483 = bitcast float %p2.i144 to i32 + %tmp484 = bitcast float %p2.i138 to i32 %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0 %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1 %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32> - %tmp487 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp486, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp486.bc = bitcast <2 x i32> %tmp486 to <2 x float> + %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp486.bc, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp488 = extractelement <4 x float> %tmp487, i32 0 %tmp489 = extractelement <4 x float> %tmp487, i32 1 %tmp490 = extractelement <4 x float> %tmp487, i32 2 @@ -1204,11 +1479,11 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp496 = fmul float %tmp489, %tmp494 %tmp497 = fmul float %tmp490, %tmp494 %tmp498 = fmul float %tmp27, %tmp495 - %tmp499 = fadd float %tmp498, %tmp192 + %tmp499 = fadd float %tmp498, %p2.i18 %tmp500 = fmul float %tmp28, %tmp496 - %tmp501 = fadd float %tmp500, %tmp193 + %tmp501 = fadd float %tmp500, %p2.i12 %tmp502 = fmul float %tmp29, %tmp497 - %tmp503 = fadd float %tmp502, %tmp194 + %tmp503 = fadd float %tmp502, %p2.i6 %tmp504 = fmul float %tmp499, %tmp482 %tmp505 = fmul float %tmp501, %tmp482 %tmp506 = fmul float %tmp503, %tmp482 @@ -1242,18 +1517,19 @@ 
ENDIF197: ; preds = %IF198, %IF189 %tmp534 = fadd float %tmp533, %tmp532 %tmp535 = fmul float %temp14.0, %tmp531 %tmp536 = fadd float %tmp534, %tmp535 - %tmp537 = call float @llvm.AMDGPU.clamp.f32(float %tmp536, float 0.000000e+00, float 1.000000e+00) - %tmp538 = fmul float %tmp364, %tmp537 - %tmp539 = fmul float %tmp365, %tmp537 - %tmp540 = fmul float %tmp366, %tmp537 + %max.0.i13 = call float @llvm.maxnum.f32(float %tmp536, float 0.000000e+00) + %clamp.i14 = call float @llvm.minnum.f32(float %max.0.i13, float 1.000000e+00) + %tmp538 = fmul float %tmp364, %clamp.i14 + %tmp539 = fmul float %tmp365, %clamp.i14 + %tmp540 = fmul float %tmp366, %clamp.i14 %tmp541 = fmul float %tmp538, %tmp68 %tmp542 = fmul float %tmp539, %tmp69 %tmp543 = fmul float %tmp540, %tmp70 - %tmp544 = fsub float -0.000000e+00, %tmp163 + %tmp544 = fsub float -0.000000e+00, %p2.i %tmp545 = fadd float %tmp96, %tmp544 - %tmp546 = fsub float -0.000000e+00, %tmp164 + %tmp546 = fsub float -0.000000e+00, %p2.i186 %tmp547 = fadd float %tmp97, %tmp546 - %tmp548 = fsub float -0.000000e+00, %tmp165 + %tmp548 = fsub float -0.000000e+00, %p2.i180 %tmp549 = fadd float %tmp98, %tmp548 %tmp550 = fmul float %tmp545, %tmp545 %tmp551 = fmul float %tmp547, %tmp547 @@ -1339,31 +1615,31 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %temp69.0 = phi float [ %tmp112, %ENDIF200 ], [ %.231, %ELSE214 ], [ %tmp108, %ELSE211 ] %temp70.0 = phi float [ %tmp113, %ENDIF200 ], [ %.232, %ELSE214 ], [ %tmp109, %ELSE211 ] %temp71.0 = phi float [ %tmp114, %ENDIF200 ], [ %.233, %ELSE214 ], [ %tmp110, %ELSE211 ] - %tmp602 = fmul float %tmp163, %tmp84 - %tmp603 = fmul float %tmp164, %tmp85 + %tmp602 = fmul float %p2.i, %tmp84 + %tmp603 = fmul float %p2.i186, %tmp85 %tmp604 = fadd float %tmp602, %tmp603 - %tmp605 = fmul float %tmp165, %tmp86 + %tmp605 = fmul float %p2.i180, %tmp86 %tmp606 = fadd float %tmp604, %tmp605 - %tmp607 = fmul float %tmp166, %tmp87 + %tmp607 = fmul float %p2.i174, %tmp87 %tmp608 = fadd float %tmp606, %tmp607 - %tmp609 = fmul float %tmp163, %tmp88 - %tmp610 = fmul float %tmp164, %tmp89 + %tmp609 = fmul float %p2.i, %tmp88 + %tmp610 = fmul float %p2.i186, %tmp89 %tmp611 = fadd float %tmp609, %tmp610 - %tmp612 = fmul float %tmp165, %tmp90 + %tmp612 = fmul float %p2.i180, %tmp90 %tmp613 = fadd float %tmp611, %tmp612 - %tmp614 = fmul float %tmp166, %tmp91 + %tmp614 = fmul float %p2.i174, %tmp91 %tmp615 = fadd float %tmp613, %tmp614 - %tmp616 = fmul float %tmp163, %tmp92 - %tmp617 = fmul float %tmp164, %tmp93 + %tmp616 = fmul float %p2.i, %tmp92 + %tmp617 = fmul float %p2.i186, %tmp93 %tmp618 = fadd float %tmp616, %tmp617 - %tmp619 = fmul float %tmp165, %tmp94 + %tmp619 = fmul float %p2.i180, %tmp94 %tmp620 = fadd float %tmp618, %tmp619 - %tmp621 = fmul float %tmp166, %tmp95 + %tmp621 = fmul float %p2.i174, %tmp95 %tmp622 = fadd float %tmp620, %tmp621 %tmp623 = fsub float -0.000000e+00, %tmp77 %tmp624 = fadd float 1.000000e+00, %tmp623 - %tmp625 = call float @fabs(float %tmp608) - %tmp626 = call float @fabs(float %tmp615) + %tmp625 = call float @llvm.fabs.f32(float %tmp608) + %tmp626 = call float @llvm.fabs.f32(float %tmp615) %tmp627 = fcmp oge float %tmp624, %tmp625 %tmp628 = sext i1 %tmp627 to i32 %tmp629 = bitcast i32 %tmp628 to float @@ -1389,7 +1665,8 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp649 = fadd float %temp80.0, -1.000000e+00 %tmp650 = fmul float %tmp649, %tmp76 %tmp651 = fadd float %tmp650, 1.000000e+00 - %tmp652 = call float @llvm.AMDGPU.clamp.f32(float %tmp651, float 0.000000e+00, float 1.000000e+00) + %max.0.i11 = call float 
@llvm.maxnum.f32(float %tmp651, float 0.000000e+00) + %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00) %tmp653 = bitcast float %tmp642 to i32 %tmp654 = bitcast float %tmp644 to i32 %tmp655 = bitcast float 0.000000e+00 to i32 @@ -1398,7 +1675,8 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2 %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3 %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32> - %tmp660 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp659, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp659.bc = bitcast <4 x i32> %tmp659 to <4 x float> + %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp659.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp661 = extractelement <4 x float> %tmp660, i32 0 %tmp662 = extractelement <4 x float> %tmp660, i32 1 %tmp663 = bitcast float %tmp646 to i32 @@ -1408,7 +1686,8 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1 %tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2 %tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3 - %tmp670 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp669, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp669.bc = bitcast <4 x i32> %tmp669 to <4 x float> + %tmp670 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp669.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp671 = extractelement <4 x float> %tmp670, i32 0 %tmp672 = extractelement <4 x float> %tmp670, i32 1 %tmp673 = fsub float -0.000000e+00, %tmp662 @@ -1425,11 +1704,13 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp684 = fadd float %tmp683, %temp89.0 %tmp685 = fmul float %tmp640, %temp90.0 %tmp686 = fadd float %tmp685, %temp91.0 - %tmp687 = call float @llvm.AMDGPU.clamp.f32(float %tmp684, float 0.000000e+00, float 1.000000e+00) - %tmp688 = call float @llvm.AMDGPU.clamp.f32(float %tmp686, float 0.000000e+00, float 1.000000e+00) - %tmp689 = fsub float -0.000000e+00, %tmp687 + %max.0.i9 = call float @llvm.maxnum.f32(float %tmp684, float 0.000000e+00) + %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00) + %max.0.i7 = call float @llvm.maxnum.f32(float %tmp686, float 0.000000e+00) + %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00) + %tmp689 = fsub float -0.000000e+00, %clamp.i10 %tmp690 = fadd float %tmp661, %tmp689 - %tmp691 = fsub float -0.000000e+00, %tmp688 + %tmp691 = fsub float -0.000000e+00, %clamp.i8 %tmp692 = fadd float %tmp671, %tmp691 %tmp693 = fmul float %tmp661, %tmp661 %tmp694 = fmul float %tmp671, %tmp671 @@ -1461,16 +1742,17 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp719 = bitcast float %tmp718 to i32 %tmp720 = icmp ne i32 %tmp719, 0 %temp28.0 = select i1 %tmp720, float 1.000000e+00, float %tmp710 - %one.sub.a.i25 = fsub float 1.000000e+00, %tmp652 + %one.sub.a.i25 = fsub float 1.000000e+00, %clamp.i12 %one.sub.ac.i26 = fmul float %one.sub.a.i25, %.229 %mul.i27 = fmul float %temp28.0, %.229 %result.i28 = fadd float %mul.i27, %one.sub.ac.i26 %tmp721 = call float @llvm.pow.f32(float %result.i28, float %tmp75) %tmp722 = fmul float %tmp721, %tmp78 %tmp723 = fadd float %tmp722, %tmp79 - %tmp724 = call float 
@llvm.AMDGPU.clamp.f32(float %tmp723, float 0.000000e+00, float 1.000000e+00) - %tmp725 = fmul float %tmp724, %tmp724 - %tmp726 = fmul float 2.000000e+00, %tmp724 + %max.0.i5 = call float @llvm.maxnum.f32(float %tmp723, float 0.000000e+00) + %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00) + %tmp725 = fmul float %clamp.i6, %clamp.i6 + %tmp726 = fmul float 2.000000e+00, %clamp.i6 %tmp727 = fsub float -0.000000e+00, %tmp726 %tmp728 = fadd float 3.000000e+00, %tmp727 %tmp729 = fmul float %tmp725, %tmp728 @@ -1504,12 +1786,13 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp747 = fadd float %tmp746, %tmp745 %tmp748 = fmul float %temp14.0, %tmp217 %tmp749 = fadd float %tmp747, %tmp748 - %tmp750 = call float @fabs(float %tmp749) + %tmp750 = call float @llvm.fabs.f32(float %tmp749) %tmp751 = fmul float %tmp750, %tmp750 %tmp752 = fmul float %tmp751, %tmp49 %tmp753 = fadd float %tmp752, %tmp50 - %tmp754 = call float @llvm.AMDGPU.clamp.f32(float %tmp753, float 0.000000e+00, float 1.000000e+00) - %tmp755 = fsub float -0.000000e+00, %tmp754 + %max.0.i3 = call float @llvm.maxnum.f32(float %tmp753, float 0.000000e+00) + %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00) + %tmp755 = fsub float -0.000000e+00, %clamp.i4 %tmp756 = fadd float 1.000000e+00, %tmp755 %tmp757 = fmul float %tmp32, %tmp756 %tmp758 = fmul float %tmp32, %tmp756 @@ -1545,12 +1828,11 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp772 = select i1 %tmp771, float 6.550400e+04, float %tmp766 %tmp773 = fmul float %result.i2, %tmp51 %tmp774 = fadd float %tmp773, %tmp52 - %tmp775 = call float @llvm.AMDGPU.clamp.f32(float %tmp774, float 0.000000e+00, float 1.000000e+00) - %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770) - %tmp777 = bitcast i32 %tmp776 to float - %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %tmp775) - %tmp779 = bitcast i32 %tmp778 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp777, float %tmp779, float %tmp777, float %tmp779) + %max.0.i1 = call float @llvm.maxnum.f32(float %tmp774, float 0.000000e+00) + %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) + %tmp776 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp768, float %tmp770) + %tmp778 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp772, float %clamp.i2) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp776, <2 x half> %tmp778, i1 true, i1 true) #0 ret void ELSE214: ; preds = %ELSE211 @@ -1566,57 +1848,32 @@ ELSE214: ; preds = %ELSE211 br label %ENDIF209 } -; Function Attrs: readnone -declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 - - -declare float @llvm.exp2.f32(float) #2 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #2 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #2 - +declare float @llvm.exp2.f32(float) #1 +declare float @llvm.ceil.f32(float) #1 +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.pow.f32(float, float) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.rsq.f32(float) 
#1 +declare float @llvm.amdgcn.cubeid(float, float, float) #1 +declare float @llvm.amdgcn.cubesc(float, float, float) #1 +declare float @llvm.amdgcn.cubetc(float, float, float) #1 +declare float @llvm.amdgcn.cubema(float, float, float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -; Function Attrs: nounwind readonly -declare float @ceil(float) #3 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.rsq.f32(float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 - -; Function Attrs: readnone -declare float @fabs(float) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 - - -; Function Attrs: nounwind readnone -declare float @llvm.pow.f32(float, float) #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #1 = { readnone } -attributes #2 = { nounwind readnone } -attributes #3 = { nounwind readonly } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll index 06f9277080a8..926702645d9e 100644 --- a/test/CodeGen/AMDGPU/si-spill-cf.ll +++ b/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -6,270 +6,271 @@ ; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] ; SI-NOT: v_readlane_b32 [[SAVED]] + define amdgpu_ps void @main() #0 { main_body: - %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) - %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) - %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) - %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) - %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) - %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) - %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) - %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) - %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) - %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) - %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) - %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) - %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) - %13 = call 
float @llvm.SI.load.const(<16 x i8> undef, i32 136) - %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) - %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) - %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) - %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) - %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) - %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) - %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) - %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) - %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) - %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) - %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) - %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) - %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) - %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) - %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) - %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) - %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) - %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) - %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) - %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) - %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) - %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) - %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) - %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) - %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) - %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) - %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) - %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) - %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) - %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) - %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) - %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) - %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) - %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) - %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) - %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) - %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336) - %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) - %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) - %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) - %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) - %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) - %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) - %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) - %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) - %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) - %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) - %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) - %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) - %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) - %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) - %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) - %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) + %tmp = call float @llvm.SI.load.const(<16 x i8> undef, 
i32 16) + %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) + %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) + %tmp3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) + %tmp4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) + %tmp5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) + %tmp6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) + %tmp7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) + %tmp8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) + %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) + %tmp10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) + %tmp11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) + %tmp12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) + %tmp13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) + %tmp14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) + %tmp15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) + %tmp16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) + %tmp17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) + %tmp18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) + %tmp19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) + %tmp20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) + %tmp21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) + %tmp22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) + %tmp23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) + %tmp24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) + %tmp25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) + %tmp26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) + %tmp27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) + %tmp28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) + %tmp29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) + %tmp30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) + %tmp31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) + %tmp32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) + %tmp33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) + %tmp34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) + %tmp35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) + %tmp36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) + %tmp37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) + %tmp38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) + %tmp39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) + %tmp40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) + %tmp41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) + %tmp42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) + %tmp43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) + %tmp44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) + %tmp45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) + %tmp46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) + %tmp47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) + %tmp48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) + %tmp49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) + %tmp50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336) + %tmp51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) + %tmp52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) + %tmp53 = call float 
@llvm.SI.load.const(<16 x i8> undef, i32 352) + %tmp54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) + %tmp55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) + %tmp56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) + %tmp57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) + %tmp58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) + %tmp59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) + %tmp60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) + %tmp61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) + %tmp62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) + %tmp63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) + %tmp64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) + %tmp65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) + %tmp66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) br label %LOOP LOOP: ; preds = %ENDIF2795, %main_body %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ] %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ] %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) - %67 = icmp sgt i32 %tid, 4 - br i1 %67, label %ENDLOOP, label %ENDIF + %tmp67 = icmp sgt i32 %tid, 4 + br i1 %tmp67, label %ENDLOOP, label %ENDIF ENDLOOP: ; preds = %ELSE2566, %LOOP - %one.sub.a.i = fsub float 1.000000e+00, %0 + %one.sub.a.i = fsub float 1.000000e+00, %tmp %one.sub.ac.i = fmul float %one.sub.a.i, undef %result.i = fadd float fmul (float undef, float undef), %one.sub.ac.i - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %result.i, float undef, float 1.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float %result.i, float undef, float 1.000000e+00, i1 true, i1 true) #0 ret void ENDIF: ; preds = %LOOP - %68 = fsub float %2, undef - %69 = fsub float %3, undef - %70 = fsub float %4, undef - %71 = fmul float %68, 0.000000e+00 - %72 = fmul float %69, undef - %73 = fmul float %70, undef - %74 = fsub float %6, undef - %75 = fsub float %7, undef - %76 = fmul float %74, undef - %77 = fmul float %75, 0.000000e+00 - %78 = call float @llvm.minnum.f32(float %73, float %77) - %79 = call float @llvm.maxnum.f32(float %71, float 0.000000e+00) - %80 = call float @llvm.maxnum.f32(float %72, float %76) - %81 = call float @llvm.maxnum.f32(float undef, float %78) - %82 = call float @llvm.minnum.f32(float %79, float %80) - %83 = call float @llvm.minnum.f32(float %82, float undef) - %84 = fsub float %14, undef - %85 = fsub float %15, undef - %86 = fsub float %16, undef - %87 = fmul float %84, undef - %88 = fmul float %85, undef - %89 = fmul float %86, undef - %90 = fsub float %17, undef - %91 = fsub float %18, undef - %92 = fsub float %19, undef - %93 = fmul float %90, 0.000000e+00 - %94 = fmul float %91, undef - %95 = fmul float %92, undef - %96 = call float @llvm.minnum.f32(float %88, float %94) - %97 = call float @llvm.maxnum.f32(float %87, float %93) - %98 = call float @llvm.maxnum.f32(float %89, float %95) - %99 = call float @llvm.maxnum.f32(float undef, float %96) - %100 = call float @llvm.maxnum.f32(float %99, float undef) - %101 = call float @llvm.minnum.f32(float %97, float undef) - %102 = call float @llvm.minnum.f32(float %101, float %98) - %103 = fsub float %30, undef - %104 = fsub float %31, undef - %105 = fmul float %103, 0.000000e+00 - %106 = fmul float %104, 0.000000e+00 - %107 = call float @llvm.minnum.f32(float undef, float %105) - %108 = call float 
@llvm.maxnum.f32(float undef, float %106) - %109 = call float @llvm.maxnum.f32(float undef, float %107) - %110 = call float @llvm.maxnum.f32(float %109, float undef) - %111 = call float @llvm.minnum.f32(float undef, float %108) - %112 = fsub float %32, undef - %113 = fsub float %33, undef - %114 = fsub float %34, undef - %115 = fmul float %112, 0.000000e+00 - %116 = fmul float %113, undef - %117 = fmul float %114, undef - %118 = fsub float %35, undef - %119 = fsub float %36, undef - %120 = fsub float %37, undef - %121 = fmul float %118, undef - %122 = fmul float %119, undef - %123 = fmul float %120, undef - %124 = call float @llvm.minnum.f32(float %115, float %121) - %125 = call float @llvm.minnum.f32(float %116, float %122) - %126 = call float @llvm.minnum.f32(float %117, float %123) - %127 = call float @llvm.maxnum.f32(float %124, float %125) - %128 = call float @llvm.maxnum.f32(float %127, float %126) - %129 = fsub float %38, undef - %130 = fsub float %39, undef - %131 = fsub float %40, undef - %132 = fmul float %129, 0.000000e+00 - %133 = fmul float %130, undef - %134 = fmul float %131, undef - %135 = fsub float %41, undef - %136 = fsub float %42, undef - %137 = fsub float %43, undef - %138 = fmul float %135, undef - %139 = fmul float %136, undef - %140 = fmul float %137, undef - %141 = call float @llvm.minnum.f32(float %132, float %138) - %142 = call float @llvm.minnum.f32(float %133, float %139) - %143 = call float @llvm.minnum.f32(float %134, float %140) - %144 = call float @llvm.maxnum.f32(float %141, float %142) - %145 = call float @llvm.maxnum.f32(float %144, float %143) - %146 = fsub float %44, undef - %147 = fsub float %45, undef - %148 = fsub float %46, undef - %149 = fmul float %146, 0.000000e+00 - %150 = fmul float %147, 0.000000e+00 - %151 = fmul float %148, undef - %152 = fsub float %47, undef - %153 = fsub float %48, undef - %154 = fsub float %49, undef - %155 = fmul float %152, undef - %156 = fmul float %153, 0.000000e+00 - %157 = fmul float %154, undef - %158 = call float @llvm.minnum.f32(float %149, float %155) - %159 = call float @llvm.minnum.f32(float %150, float %156) - %160 = call float @llvm.minnum.f32(float %151, float %157) - %161 = call float @llvm.maxnum.f32(float %158, float %159) - %162 = call float @llvm.maxnum.f32(float %161, float %160) - %163 = fsub float %50, undef - %164 = fsub float %51, undef - %165 = fsub float %52, undef - %166 = fmul float %163, undef - %167 = fmul float %164, 0.000000e+00 - %168 = fmul float %165, 0.000000e+00 - %169 = fsub float %53, undef - %170 = fsub float %54, undef - %171 = fsub float %55, undef - %172 = fdiv float 1.000000e+00, %temp18.0 - %173 = fmul float %169, undef - %174 = fmul float %170, undef - %175 = fmul float %171, %172 - %176 = call float @llvm.minnum.f32(float %166, float %173) - %177 = call float @llvm.minnum.f32(float %167, float %174) - %178 = call float @llvm.minnum.f32(float %168, float %175) - %179 = call float @llvm.maxnum.f32(float %176, float %177) - %180 = call float @llvm.maxnum.f32(float %179, float %178) - %181 = fsub float %62, undef - %182 = fsub float %63, undef - %183 = fsub float %64, undef - %184 = fmul float %181, 0.000000e+00 - %185 = fmul float %182, undef - %186 = fmul float %183, undef - %187 = fsub float %65, undef - %188 = fsub float %66, undef - %189 = fmul float %187, undef - %190 = fmul float %188, undef - %191 = call float @llvm.maxnum.f32(float %184, float %189) - %192 = call float @llvm.maxnum.f32(float %185, float %190) - %193 = call float @llvm.maxnum.f32(float %186, float 
undef) - %194 = call float @llvm.minnum.f32(float %191, float %192) - %195 = call float @llvm.minnum.f32(float %194, float %193) - %.temp292.7 = select i1 undef, float %162, float undef - %temp292.9 = select i1 false, float %180, float %.temp292.7 + %tmp68 = fsub float %tmp2, undef + %tmp69 = fsub float %tmp3, undef + %tmp70 = fsub float %tmp4, undef + %tmp71 = fmul float %tmp68, 0.000000e+00 + %tmp72 = fmul float %tmp69, undef + %tmp73 = fmul float %tmp70, undef + %tmp74 = fsub float %tmp6, undef + %tmp75 = fsub float %tmp7, undef + %tmp76 = fmul float %tmp74, undef + %tmp77 = fmul float %tmp75, 0.000000e+00 + %tmp78 = call float @llvm.minnum.f32(float %tmp73, float %tmp77) + %tmp79 = call float @llvm.maxnum.f32(float %tmp71, float 0.000000e+00) + %tmp80 = call float @llvm.maxnum.f32(float %tmp72, float %tmp76) + %tmp81 = call float @llvm.maxnum.f32(float undef, float %tmp78) + %tmp82 = call float @llvm.minnum.f32(float %tmp79, float %tmp80) + %tmp83 = call float @llvm.minnum.f32(float %tmp82, float undef) + %tmp84 = fsub float %tmp14, undef + %tmp85 = fsub float %tmp15, undef + %tmp86 = fsub float %tmp16, undef + %tmp87 = fmul float %tmp84, undef + %tmp88 = fmul float %tmp85, undef + %tmp89 = fmul float %tmp86, undef + %tmp90 = fsub float %tmp17, undef + %tmp91 = fsub float %tmp18, undef + %tmp92 = fsub float %tmp19, undef + %tmp93 = fmul float %tmp90, 0.000000e+00 + %tmp94 = fmul float %tmp91, undef + %tmp95 = fmul float %tmp92, undef + %tmp96 = call float @llvm.minnum.f32(float %tmp88, float %tmp94) + %tmp97 = call float @llvm.maxnum.f32(float %tmp87, float %tmp93) + %tmp98 = call float @llvm.maxnum.f32(float %tmp89, float %tmp95) + %tmp99 = call float @llvm.maxnum.f32(float undef, float %tmp96) + %tmp100 = call float @llvm.maxnum.f32(float %tmp99, float undef) + %tmp101 = call float @llvm.minnum.f32(float %tmp97, float undef) + %tmp102 = call float @llvm.minnum.f32(float %tmp101, float %tmp98) + %tmp103 = fsub float %tmp30, undef + %tmp104 = fsub float %tmp31, undef + %tmp105 = fmul float %tmp103, 0.000000e+00 + %tmp106 = fmul float %tmp104, 0.000000e+00 + %tmp107 = call float @llvm.minnum.f32(float undef, float %tmp105) + %tmp108 = call float @llvm.maxnum.f32(float undef, float %tmp106) + %tmp109 = call float @llvm.maxnum.f32(float undef, float %tmp107) + %tmp110 = call float @llvm.maxnum.f32(float %tmp109, float undef) + %tmp111 = call float @llvm.minnum.f32(float undef, float %tmp108) + %tmp112 = fsub float %tmp32, undef + %tmp113 = fsub float %tmp33, undef + %tmp114 = fsub float %tmp34, undef + %tmp115 = fmul float %tmp112, 0.000000e+00 + %tmp116 = fmul float %tmp113, undef + %tmp117 = fmul float %tmp114, undef + %tmp118 = fsub float %tmp35, undef + %tmp119 = fsub float %tmp36, undef + %tmp120 = fsub float %tmp37, undef + %tmp121 = fmul float %tmp118, undef + %tmp122 = fmul float %tmp119, undef + %tmp123 = fmul float %tmp120, undef + %tmp124 = call float @llvm.minnum.f32(float %tmp115, float %tmp121) + %tmp125 = call float @llvm.minnum.f32(float %tmp116, float %tmp122) + %tmp126 = call float @llvm.minnum.f32(float %tmp117, float %tmp123) + %tmp127 = call float @llvm.maxnum.f32(float %tmp124, float %tmp125) + %tmp128 = call float @llvm.maxnum.f32(float %tmp127, float %tmp126) + %tmp129 = fsub float %tmp38, undef + %tmp130 = fsub float %tmp39, undef + %tmp131 = fsub float %tmp40, undef + %tmp132 = fmul float %tmp129, 0.000000e+00 + %tmp133 = fmul float %tmp130, undef + %tmp134 = fmul float %tmp131, undef + %tmp135 = fsub float %tmp41, undef + %tmp136 = fsub float %tmp42, undef + 
%tmp137 = fsub float %tmp43, undef + %tmp138 = fmul float %tmp135, undef + %tmp139 = fmul float %tmp136, undef + %tmp140 = fmul float %tmp137, undef + %tmp141 = call float @llvm.minnum.f32(float %tmp132, float %tmp138) + %tmp142 = call float @llvm.minnum.f32(float %tmp133, float %tmp139) + %tmp143 = call float @llvm.minnum.f32(float %tmp134, float %tmp140) + %tmp144 = call float @llvm.maxnum.f32(float %tmp141, float %tmp142) + %tmp145 = call float @llvm.maxnum.f32(float %tmp144, float %tmp143) + %tmp146 = fsub float %tmp44, undef + %tmp147 = fsub float %tmp45, undef + %tmp148 = fsub float %tmp46, undef + %tmp149 = fmul float %tmp146, 0.000000e+00 + %tmp150 = fmul float %tmp147, 0.000000e+00 + %tmp151 = fmul float %tmp148, undef + %tmp152 = fsub float %tmp47, undef + %tmp153 = fsub float %tmp48, undef + %tmp154 = fsub float %tmp49, undef + %tmp155 = fmul float %tmp152, undef + %tmp156 = fmul float %tmp153, 0.000000e+00 + %tmp157 = fmul float %tmp154, undef + %tmp158 = call float @llvm.minnum.f32(float %tmp149, float %tmp155) + %tmp159 = call float @llvm.minnum.f32(float %tmp150, float %tmp156) + %tmp160 = call float @llvm.minnum.f32(float %tmp151, float %tmp157) + %tmp161 = call float @llvm.maxnum.f32(float %tmp158, float %tmp159) + %tmp162 = call float @llvm.maxnum.f32(float %tmp161, float %tmp160) + %tmp163 = fsub float %tmp50, undef + %tmp164 = fsub float %tmp51, undef + %tmp165 = fsub float %tmp52, undef + %tmp166 = fmul float %tmp163, undef + %tmp167 = fmul float %tmp164, 0.000000e+00 + %tmp168 = fmul float %tmp165, 0.000000e+00 + %tmp169 = fsub float %tmp53, undef + %tmp170 = fsub float %tmp54, undef + %tmp171 = fsub float %tmp55, undef + %tmp172 = fdiv float 1.000000e+00, %temp18.0 + %tmp173 = fmul float %tmp169, undef + %tmp174 = fmul float %tmp170, undef + %tmp175 = fmul float %tmp171, %tmp172 + %tmp176 = call float @llvm.minnum.f32(float %tmp166, float %tmp173) + %tmp177 = call float @llvm.minnum.f32(float %tmp167, float %tmp174) + %tmp178 = call float @llvm.minnum.f32(float %tmp168, float %tmp175) + %tmp179 = call float @llvm.maxnum.f32(float %tmp176, float %tmp177) + %tmp180 = call float @llvm.maxnum.f32(float %tmp179, float %tmp178) + %tmp181 = fsub float %tmp62, undef + %tmp182 = fsub float %tmp63, undef + %tmp183 = fsub float %tmp64, undef + %tmp184 = fmul float %tmp181, 0.000000e+00 + %tmp185 = fmul float %tmp182, undef + %tmp186 = fmul float %tmp183, undef + %tmp187 = fsub float %tmp65, undef + %tmp188 = fsub float %tmp66, undef + %tmp189 = fmul float %tmp187, undef + %tmp190 = fmul float %tmp188, undef + %tmp191 = call float @llvm.maxnum.f32(float %tmp184, float %tmp189) + %tmp192 = call float @llvm.maxnum.f32(float %tmp185, float %tmp190) + %tmp193 = call float @llvm.maxnum.f32(float %tmp186, float undef) + %tmp194 = call float @llvm.minnum.f32(float %tmp191, float %tmp192) + %tmp195 = call float @llvm.minnum.f32(float %tmp194, float %tmp193) + %.temp292.7 = select i1 undef, float %tmp162, float undef + %temp292.9 = select i1 false, float %tmp180, float %.temp292.7 %.temp292.9 = select i1 undef, float undef, float %temp292.9 - %196 = fcmp ogt float undef, 0.000000e+00 - %197 = fcmp olt float undef, %195 - %198 = and i1 %196, %197 - %199 = fcmp olt float undef, %.temp292.9 - %200 = and i1 %198, %199 - %temp292.11 = select i1 %200, float undef, float %.temp292.9 - %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tmp196 = fcmp ogt float undef, 0.000000e+00 + %tmp197 = fcmp olt float undef, %tmp195 + %tmp198 = and i1 %tmp196, %tmp197 + %tmp199 = fcmp olt float 
undef, %.temp292.9 + %tmp200 = and i1 %tmp198, %tmp199 + %temp292.11 = select i1 %tmp200, float undef, float %.temp292.9 + %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %cmp0 = icmp eq i32 %tid0, 0 br i1 %cmp0, label %IF2565, label %ELSE2566 IF2565: ; preds = %ENDIF - %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %cmp1 = icmp eq i32 %tid1, 0 br i1 %cmp1, label %ENDIF2582, label %ELSE2584 ELSE2566: ; preds = %ENDIF - %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tidf = bitcast i32 %tid2 to float - %201 = fcmp oeq float %temp292.11, %tidf - br i1 %201, label %ENDLOOP, label %ELSE2593 + %tmp201 = fcmp oeq float %temp292.11, %tidf + br i1 %tmp201, label %ENDLOOP, label %ELSE2593 ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] - %temp18.1 = phi float [ %218, %ENDIF2588 ], [ undef, %ENDIF2594 ] - %202 = fsub float %5, undef - %203 = fmul float %202, undef - %204 = call float @llvm.maxnum.f32(float undef, float %203) - %205 = call float @llvm.minnum.f32(float %204, float undef) - %206 = call float @llvm.minnum.f32(float %205, float undef) - %207 = fcmp ogt float undef, 0.000000e+00 - %208 = fcmp olt float undef, 1.000000e+00 - %209 = and i1 %207, %208 - %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %temp18.1 = phi float [ %tmp218, %ENDIF2588 ], [ undef, %ENDIF2594 ] + %tmp202 = fsub float %tmp5, undef + %tmp203 = fmul float %tmp202, undef + %tmp204 = call float @llvm.maxnum.f32(float undef, float %tmp203) + %tmp205 = call float @llvm.minnum.f32(float %tmp204, float undef) + %tmp206 = call float @llvm.minnum.f32(float %tmp205, float undef) + %tmp207 = fcmp ogt float undef, 0.000000e+00 + %tmp208 = fcmp olt float undef, 1.000000e+00 + %tmp209 = and i1 %tmp207, %tmp208 + %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tidf3 = bitcast i32 %tid3 to float - %210 = fcmp olt float %tidf3, %206 - %211 = and i1 %209, %210 - br i1 %211, label %ENDIF2795, label %ELSE2797 + %tmp210 = fcmp olt float %tidf3, %tmp206 + %tmp211 = and i1 %tmp209, %tmp210 + br i1 %tmp211, label %ENDIF2795, label %ELSE2797 ELSE2584: ; preds = %IF2565 br label %ENDIF2582 ENDIF2582: ; preds = %ELSE2584, %IF2565 - %212 = fadd float %1, undef - %213 = fadd float 0.000000e+00, %212 - %floor = call float @llvm.floor.f32(float %213) - %214 = fsub float %213, %floor - %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tmp212 = fadd float %tmp1, undef + %tmp213 = fadd float 0.000000e+00, %tmp212 + %floor = call float @llvm.floor.f32(float %tmp213) + %tmp214 = fsub float %tmp213, %floor + %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %cmp4 = icmp eq i32 %tid4, 0 br i1 %cmp4, label %IF2589, label %ELSE2590 @@ -280,61 +281,61 @@ ELSE2590: ; preds = %ENDIF2582 br label %ENDIF2588 ENDIF2588: ; preds = %ELSE2590, %IF2589 - %215 = fsub float 1.000000e+00, %214 - %216 = call float @llvm.sqrt.f32(float %215) - %217 = fmul float %216, undef - %218 = fadd float %217, undef + %tmp215 = fsub float 1.000000e+00, %tmp214 + %tmp216 = call float @llvm.sqrt.f32(float %tmp215) + %tmp217 = fmul float %tmp216, undef + %tmp218 = fadd float %tmp217, undef br label %ENDIF2564 ELSE2593: ; preds = %ELSE2566 - %219 = fcmp oeq float %temp292.11, %81 - %220 = fcmp olt float %81, %83 - %221 = and i1 %219, %220 - br i1 %221, label %ENDIF2594, label %ELSE2596 + %tmp219 = fcmp oeq float %temp292.11, %tmp81 + %tmp220 = fcmp olt 
float %tmp81, %tmp83 + %tmp221 = and i1 %tmp219, %tmp220 + br i1 %tmp221, label %ENDIF2594, label %ELSE2596 ELSE2596: ; preds = %ELSE2593 - %222 = fcmp oeq float %temp292.11, %100 - %223 = fcmp olt float %100, %102 - %224 = and i1 %222, %223 - br i1 %224, label %ENDIF2594, label %ELSE2632 + %tmp222 = fcmp oeq float %temp292.11, %tmp100 + %tmp223 = fcmp olt float %tmp100, %tmp102 + %tmp224 = and i1 %tmp222, %tmp223 + br i1 %tmp224, label %ENDIF2594, label %ELSE2632 ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] - %225 = fmul float %temp894.2, undef + %tmp225 = fmul float %temp894.2, undef br label %ENDIF2564 ELSE2632: ; preds = %ELSE2596 br i1 undef, label %ENDIF2594, label %ELSE2650 ELSE2650: ; preds = %ELSE2632 - %226 = fcmp oeq float %temp292.11, %110 - %227 = fcmp olt float %110, %111 - %228 = and i1 %226, %227 - br i1 %228, label %IF2667, label %ELSE2668 + %tmp226 = fcmp oeq float %temp292.11, %tmp110 + %tmp227 = fcmp olt float %tmp110, %tmp111 + %tmp228 = and i1 %tmp226, %tmp227 + br i1 %tmp228, label %IF2667, label %ELSE2668 IF2667: ; preds = %ELSE2650 br i1 undef, label %ENDIF2594, label %ELSE2671 ELSE2668: ; preds = %ELSE2650 - %229 = fcmp oeq float %temp292.11, %128 - %230 = fcmp olt float %128, undef - %231 = and i1 %229, %230 - br i1 %231, label %ENDIF2594, label %ELSE2686 + %tmp229 = fcmp oeq float %temp292.11, %tmp128 + %tmp230 = fcmp olt float %tmp128, undef + %tmp231 = and i1 %tmp229, %tmp230 + br i1 %tmp231, label %ENDIF2594, label %ELSE2686 ELSE2671: ; preds = %IF2667 br label %ENDIF2594 ELSE2686: ; preds = %ELSE2668 - %232 = fcmp oeq float %temp292.11, %145 - %233 = fcmp olt float %145, undef - %234 = and i1 %232, %233 - br i1 %234, label %ENDIF2594, label %ELSE2704 + %tmp232 = fcmp oeq float %temp292.11, %tmp145 + %tmp233 = fcmp olt float %tmp145, undef + %tmp234 = and i1 %tmp232, %tmp233 + br i1 %tmp234, label %ENDIF2594, label %ELSE2704 ELSE2704: ; preds = %ELSE2686 - %235 = fcmp oeq float %temp292.11, %180 - %236 = fcmp olt float %180, undef - %237 = and i1 %235, %236 - br i1 %237, label %ENDIF2594, label %ELSE2740 + %tmp235 = fcmp oeq float %temp292.11, %tmp180 + %tmp236 = fcmp olt float %tmp180, undef + %tmp237 = and i1 %tmp235, %tmp236 + br i1 %tmp237, label %ENDIF2594, label %ELSE2740 ELSE2740: ; preds = %ELSE2704 br i1 undef, label %IF2757, label %ELSE2758 @@ -349,8 +350,8 @@ ELSE2761: ; preds = %IF2757 br label %ENDIF2594 IF2775: ; preds = %ELSE2758 - %238 = fcmp olt float undef, undef - br i1 %238, label %ENDIF2594, label %ELSE2779 + %tmp238 = fcmp olt float undef, undef + br i1 %tmp238, label %ENDIF2594, label %ELSE2779 ELSE2779: ; preds = %IF2775 br i1 undef, label %ENDIF2594, label %ELSE2782 @@ -359,39 +360,39 @@ ELSE2782: ; preds = %ELSE2779 br i1 undef, label %ENDIF2594, label %ELSE2785 ELSE2785: ; preds = %ELSE2782 - %239 = fcmp olt float undef, 0.000000e+00 - br i1 %239, label %ENDIF2594, label %ELSE2788 + %tmp239 = fcmp olt float undef, 0.000000e+00 
+ br i1 %tmp239, label %ENDIF2594, label %ELSE2788 ELSE2788: ; preds = %ELSE2785 - %240 = fcmp olt float 0.000000e+00, undef - %.2848 = select i1 %240, float -1.000000e+00, float 1.000000e+00 + %tmp240 = fcmp olt float 0.000000e+00, undef + %.2848 = select i1 %tmp240, float -1.000000e+00, float 1.000000e+00 br label %ENDIF2594 ELSE2797: ; preds = %ENDIF2564 - %241 = fsub float %8, undef - %242 = fsub float %9, undef - %243 = fsub float %10, undef - %244 = fmul float %241, undef - %245 = fmul float %242, undef - %246 = fmul float %243, undef - %247 = fsub float %11, undef - %248 = fsub float %12, undef - %249 = fsub float %13, undef - %250 = fmul float %247, undef - %251 = fmul float %248, undef - %252 = fmul float %249, undef - %253 = call float @llvm.minnum.f32(float %244, float %250) - %254 = call float @llvm.minnum.f32(float %245, float %251) - %255 = call float @llvm.maxnum.f32(float %246, float %252) - %256 = call float @llvm.maxnum.f32(float %253, float %254) - %257 = call float @llvm.maxnum.f32(float %256, float undef) - %258 = call float @llvm.minnum.f32(float undef, float %255) - %259 = fcmp ogt float %257, 0.000000e+00 - %260 = fcmp olt float %257, 1.000000e+00 - %261 = and i1 %259, %260 - %262 = fcmp olt float %257, %258 - %263 = and i1 %261, %262 - br i1 %263, label %ENDIF2795, label %ELSE2800 + %tmp241 = fsub float %tmp8, undef + %tmp242 = fsub float %tmp9, undef + %tmp243 = fsub float %tmp10, undef + %tmp244 = fmul float %tmp241, undef + %tmp245 = fmul float %tmp242, undef + %tmp246 = fmul float %tmp243, undef + %tmp247 = fsub float %tmp11, undef + %tmp248 = fsub float %tmp12, undef + %tmp249 = fsub float %tmp13, undef + %tmp250 = fmul float %tmp247, undef + %tmp251 = fmul float %tmp248, undef + %tmp252 = fmul float %tmp249, undef + %tmp253 = call float @llvm.minnum.f32(float %tmp244, float %tmp250) + %tmp254 = call float @llvm.minnum.f32(float %tmp245, float %tmp251) + %tmp255 = call float @llvm.maxnum.f32(float %tmp246, float %tmp252) + %tmp256 = call float @llvm.maxnum.f32(float %tmp253, float %tmp254) + %tmp257 = call float @llvm.maxnum.f32(float %tmp256, float undef) + %tmp258 = call float @llvm.minnum.f32(float undef, float %tmp255) + %tmp259 = fcmp ogt float %tmp257, 0.000000e+00 + %tmp260 = fcmp olt float %tmp257, 1.000000e+00 + %tmp261 = and i1 %tmp259, %tmp260 + %tmp262 = fcmp olt float %tmp257, %tmp258 + %tmp263 = and i1 %tmp261, %tmp262 + br i1 %tmp263, label %ENDIF2795, label %ELSE2800 ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564 br label %LOOP @@ -400,53 +401,53 @@ ELSE2800: ; preds = %ELSE2797 br i1 undef, label %ENDIF2795, label %ELSE2803 ELSE2803: ; preds = %ELSE2800 - %264 = fsub float %20, undef - %265 = fsub float %21, undef - %266 = fsub float %22, undef - %267 = fmul float %264, undef - %268 = fmul float %265, undef - %269 = fmul float %266, 0.000000e+00 - %270 = fsub float %23, undef - %271 = fsub float %24, undef - %272 = fsub float %25, undef - %273 = fmul float %270, undef - %274 = fmul float %271, undef - %275 = fmul float %272, undef - %276 = call float @llvm.minnum.f32(float %267, float %273) - %277 = call float @llvm.maxnum.f32(float %268, float %274) - %278 = call float @llvm.maxnum.f32(float %269, float %275) - %279 = call float @llvm.maxnum.f32(float %276, float undef) - %280 = call float @llvm.maxnum.f32(float %279, float undef) - %281 = call float @llvm.minnum.f32(float undef, float %277) - %282 = call float @llvm.minnum.f32(float %281, float 
%278) - %283 = fcmp ogt float %280, 0.000000e+00 - %284 = fcmp olt float %280, 1.000000e+00 - %285 = and i1 %283, %284 - %286 = fcmp olt float %280, %282 - %287 = and i1 %285, %286 - br i1 %287, label %ENDIF2795, label %ELSE2806 + %tmp264 = fsub float %tmp20, undef + %tmp265 = fsub float %tmp21, undef + %tmp266 = fsub float %tmp22, undef + %tmp267 = fmul float %tmp264, undef + %tmp268 = fmul float %tmp265, undef + %tmp269 = fmul float %tmp266, 0.000000e+00 + %tmp270 = fsub float %tmp23, undef + %tmp271 = fsub float %tmp24, undef + %tmp272 = fsub float %tmp25, undef + %tmp273 = fmul float %tmp270, undef + %tmp274 = fmul float %tmp271, undef + %tmp275 = fmul float %tmp272, undef + %tmp276 = call float @llvm.minnum.f32(float %tmp267, float %tmp273) + %tmp277 = call float @llvm.maxnum.f32(float %tmp268, float %tmp274) + %tmp278 = call float @llvm.maxnum.f32(float %tmp269, float %tmp275) + %tmp279 = call float @llvm.maxnum.f32(float %tmp276, float undef) + %tmp280 = call float @llvm.maxnum.f32(float %tmp279, float undef) + %tmp281 = call float @llvm.minnum.f32(float undef, float %tmp277) + %tmp282 = call float @llvm.minnum.f32(float %tmp281, float %tmp278) + %tmp283 = fcmp ogt float %tmp280, 0.000000e+00 + %tmp284 = fcmp olt float %tmp280, 1.000000e+00 + %tmp285 = and i1 %tmp283, %tmp284 + %tmp286 = fcmp olt float %tmp280, %tmp282 + %tmp287 = and i1 %tmp285, %tmp286 + br i1 %tmp287, label %ENDIF2795, label %ELSE2806 ELSE2806: ; preds = %ELSE2803 - %288 = fsub float %26, undef - %289 = fsub float %27, undef - %290 = fsub float %28, undef - %291 = fmul float %288, undef - %292 = fmul float %289, 0.000000e+00 - %293 = fmul float %290, undef - %294 = fsub float %29, undef - %295 = fmul float %294, undef - %296 = call float @llvm.minnum.f32(float %291, float %295) - %297 = call float @llvm.minnum.f32(float %292, float undef) - %298 = call float @llvm.maxnum.f32(float %293, float undef) - %299 = call float @llvm.maxnum.f32(float %296, float %297) - %300 = call float @llvm.maxnum.f32(float %299, float undef) - %301 = call float @llvm.minnum.f32(float undef, float %298) - %302 = fcmp ogt float %300, 0.000000e+00 - %303 = fcmp olt float %300, 1.000000e+00 - %304 = and i1 %302, %303 - %305 = fcmp olt float %300, %301 - %306 = and i1 %304, %305 - br i1 %306, label %ENDIF2795, label %ELSE2809 + %tmp288 = fsub float %tmp26, undef + %tmp289 = fsub float %tmp27, undef + %tmp290 = fsub float %tmp28, undef + %tmp291 = fmul float %tmp288, undef + %tmp292 = fmul float %tmp289, 0.000000e+00 + %tmp293 = fmul float %tmp290, undef + %tmp294 = fsub float %tmp29, undef + %tmp295 = fmul float %tmp294, undef + %tmp296 = call float @llvm.minnum.f32(float %tmp291, float %tmp295) + %tmp297 = call float @llvm.minnum.f32(float %tmp292, float undef) + %tmp298 = call float @llvm.maxnum.f32(float %tmp293, float undef) + %tmp299 = call float @llvm.maxnum.f32(float %tmp296, float %tmp297) + %tmp300 = call float @llvm.maxnum.f32(float %tmp299, float undef) + %tmp301 = call float @llvm.minnum.f32(float undef, float %tmp298) + %tmp302 = fcmp ogt float %tmp300, 0.000000e+00 + %tmp303 = fcmp olt float %tmp300, 1.000000e+00 + %tmp304 = and i1 %tmp302, %tmp303 + %tmp305 = fcmp olt float %tmp300, %tmp301 + %tmp306 = and i1 %tmp304, %tmp305 + br i1 %tmp306, label %ENDIF2795, label %ELSE2809 ELSE2809: ; preds = %ELSE2806 br i1 undef, label %ENDIF2795, label %ELSE2812 @@ -461,53 +462,42 @@ ELSE2818: ; preds = %ELSE2815 br i1 undef, label %ENDIF2795, label %ELSE2821 ELSE2821: ; preds = %ELSE2818 - %307 = fsub float %56, undef - %308 = fsub 
float %57, undef - %309 = fsub float %58, undef - %310 = fmul float %307, undef - %311 = fmul float %308, 0.000000e+00 - %312 = fmul float %309, undef - %313 = fsub float %59, undef - %314 = fsub float %60, undef - %315 = fsub float %61, undef - %316 = fmul float %313, undef - %317 = fmul float %314, undef - %318 = fmul float %315, undef - %319 = call float @llvm.maxnum.f32(float %310, float %316) - %320 = call float @llvm.maxnum.f32(float %311, float %317) - %321 = call float @llvm.maxnum.f32(float %312, float %318) - %322 = call float @llvm.minnum.f32(float %319, float %320) - %323 = call float @llvm.minnum.f32(float %322, float %321) - %324 = fcmp ogt float undef, 0.000000e+00 - %325 = fcmp olt float undef, 1.000000e+00 - %326 = and i1 %324, %325 - %327 = fcmp olt float undef, %323 - %328 = and i1 %326, %327 - br i1 %328, label %ENDIF2795, label %ELSE2824 + %tmp307 = fsub float %tmp56, undef + %tmp308 = fsub float %tmp57, undef + %tmp309 = fsub float %tmp58, undef + %tmp310 = fmul float %tmp307, undef + %tmp311 = fmul float %tmp308, 0.000000e+00 + %tmp312 = fmul float %tmp309, undef + %tmp313 = fsub float %tmp59, undef + %tmp314 = fsub float %tmp60, undef + %tmp315 = fsub float %tmp61, undef + %tmp316 = fmul float %tmp313, undef + %tmp317 = fmul float %tmp314, undef + %tmp318 = fmul float %tmp315, undef + %tmp319 = call float @llvm.maxnum.f32(float %tmp310, float %tmp316) + %tmp320 = call float @llvm.maxnum.f32(float %tmp311, float %tmp317) + %tmp321 = call float @llvm.maxnum.f32(float %tmp312, float %tmp318) + %tmp322 = call float @llvm.minnum.f32(float %tmp319, float %tmp320) + %tmp323 = call float @llvm.minnum.f32(float %tmp322, float %tmp321) + %tmp324 = fcmp ogt float undef, 0.000000e+00 + %tmp325 = fcmp olt float undef, 1.000000e+00 + %tmp326 = and i1 %tmp324, %tmp325 + %tmp327 = fcmp olt float undef, %tmp323 + %tmp328 = and i1 %tmp326, %tmp327 + br i1 %tmp328, label %ENDIF2795, label %ELSE2824 ELSE2824: ; preds = %ELSE2821 %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00 br label %ENDIF2795 } -declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone declare float @llvm.floor.f32(float) #1 - -; Function Attrs: nounwind readnone declare float @llvm.sqrt.f32(float) #1 - -; Function Attrs: nounwind readnone declare float @llvm.minnum.f32(float, float) #1 - -; Function Attrs: nounwind readnone declare float @llvm.maxnum.f32(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index 062f5245af10..114c97b61bd4 100644 --- a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -8,7 +8,7 @@ ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. 
-; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12 +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16 ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] ; SGPR-NEXT: s_nop 4 @@ -16,15 +16,15 @@ ; Make sure scratch wave offset register is correctly incremented and ; then restored. -; SMEM: s_mov_b32 m0, s[[OFF]]{{$}} +; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Spill -; SMEM: s_mov_b32 m0, s[[OFF]]{{$}} +; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}} ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Reload ; SMEM: s_dcache_wb ; ALL: s_endpgm -define void @test(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 4beefb047f22..8a4cee264fd8 100644 --- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2 ; FUNC-LABEL: @reorder_local_load_global_store_local_load ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3 ; CI: buffer_store_dword -define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -33,7 +33,7 @@ define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 ; CI: buffer_store_dword ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -53,7 +53,7 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: buffer_store_dword -define void 
@no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -77,7 +77,7 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: buffer_store_dword -define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 @@ -100,7 +100,7 @@ define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: ds_write_b32 ; CI: buffer_store_dword -define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { +define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 @@ -122,7 +122,7 @@ define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* % ; CI: s_load_dword ; CI: ds_write_b32 ; CI: buffer_store_dword -define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { +define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 @@ -141,7 +141,7 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 ; CI: buffer_load_dword ; CI: buffer_load_dword ; CI: buffer_store_dword -define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { +define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3 @@ -157,12 +157,11 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, ; FUNC-LABEL: @reorder_local_offsets ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102 -; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 +; CI-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100 +; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 ; CI: buffer_store_dword ; CI: s_endpgm -define void @reorder_local_offsets(i32 addrspace(1)* 
nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { +define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102 @@ -181,14 +180,14 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa } ; FUNC-LABEL: @reorder_global_offsets -; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 -; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 -; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 -; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 +; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 +; CI: buffer_store_dword ; CI: s_endpgm -define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { +define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102 @@ -222,7 +221,7 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp ; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}} ; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}} -define void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { +define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll index dd8783df5c3c..7990990478af 100644 --- a/test/CodeGen/AMDGPU/si-vector-hang.ll +++ b/test/CodeGen/AMDGPU/si-vector-hang.ll @@ -12,7 +12,7 @@ ; CHECK: buffer_store_byte ; ModuleID = 'radeon' -define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { +define amdgpu_kernel void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { entry: %0 = load i8, i8 addrspace(1)* %in0, align 1 %1 = insertelement <8 x i8> undef, i8 %0, i32 0 diff --git a/test/CodeGen/AMDGPU/sign_extend.ll 
b/test/CodeGen/AMDGPU/sign_extend.ll index 875351c59961..3e452c214e98 100644 --- a/test/CodeGen/AMDGPU/sign_extend.ll +++ b/test/CodeGen/AMDGPU/sign_extend.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 ; GCN: s_endpgm -define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -14,7 +14,7 @@ define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; GCN-LABEL: {{^}}test_s_sext_i32_to_i64: ; GCN: s_ashr_i32 ; GCN: s_endpg -define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { entry: %mul = mul i32 %a, %b %add = add i32 %mul, %c @@ -28,7 +28,7 @@ entry: ; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} ; GCN: s_endpgm -define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 @@ -38,7 +38,7 @@ define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; GCN-LABEL: {{^}}s_sext_i32_to_i64: ; GCN: s_ashr_i32 ; GCN: s_endpgm -define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { %sext = sext i32 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void @@ -47,7 +47,7 @@ define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { ; GCN-LABEL: {{^}}v_sext_i32_to_i64: ; GCN: v_ashr ; GCN: s_endpgm -define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %sext = sext i32 %val to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 @@ -56,7 +56,7 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no ; GCN-LABEL: {{^}}s_sext_i16_to_i64: ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000 -define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { +define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { %sext = sext i16 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void @@ -65,7 +65,7 @@ define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { ; GCN-LABEL: {{^}}s_sext_i1_to_i16: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i16 store i16 %sext, i16 addrspace(1)* %out @@ -79,7 +79,7 @@ define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 
%a, i32 %b, i32 %c, i32 %d) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d %cmp = and i1 %cmp0, %cmp1 @@ -91,7 +91,7 @@ define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i ; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp0 = icmp eq i32 %a, %tid %cmp1 = icmp eq i32 %b, %c @@ -130,7 +130,7 @@ define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i ; GCN-DAG: buffer_store_dword [[VEXT3]] ; GCN: s_endpgm -define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cast = bitcast i32 %a to <4 x i8> %ext = sext <4 x i8> %cast to <4 x i32> %elt0 = extractelement <4 x i32> %ext, i32 0 @@ -162,7 +162,7 @@ define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; GCN: buffer_store_dword [[EXT1]] ; GCN: buffer_store_dword [[EXT2]] ; GCN: buffer_store_dword [[EXT3]] -define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %a = load i32, i32 addrspace(1)* %in %cast = bitcast i32 %a to <4 x i8> %ext = sext <4 x i8> %cast to <4 x i32> @@ -184,7 +184,7 @@ define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-DAG: s_sext_i32_i16 ; GCN-DAG: s_sext_i32_i16 ; GCN: s_endpgm -define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { %cast = bitcast i64 %a to <4 x i16> %ext = sext <4 x i16> %cast to <4 x i32> %elt0 = extractelement <4 x i32> %ext, i32 0 @@ -206,7 +206,7 @@ define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: s_endpgm -define void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %a = load i64, i64 addrspace(1)* %in %cast = bitcast i64 %a to <4 x i16> %ext = sext <4 x i16> %cast to <4 x i32> diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 68dc3c6ccd24..f98a716b4fd1 100644 --- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI-LABEL: {{^}}sint_to_fp_i32_to_f64 ; SI: v_cvt_f64_i32_e32 -define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { %result = sitofp i32 %in to double store double %result, double addrspace(1)* %out ret void @@ -19,7 +19,7 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 
v{{\[}}[[ZERO]]:[[SEL]]{{\]}} ; SI: s_endpgm -define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 %fp = sitofp i1 %cmp to double store double %fp, double addrspace(1)* %out, align 4 @@ -31,14 +31,14 @@ define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { ; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { +define amdgpu_kernel void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { %fp = sitofp i1 %in to double store double %fp, double addrspace(1)* %out, align 8 ret void } ; SI-LABEL: @s_sint_to_fp_i64_to_f64 -define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { %result = sitofp i64 %in to double store double %result, double addrspace(1)* %out ret void @@ -51,7 +51,7 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %gep, align 8 diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index 5df8105116cc..04cd199b81ae 100644 --- a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -4,7 +4,7 @@ ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600 ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f16: -define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { %result = sitofp i64 %in to half store half %result, half addrspace(1)* %out ret void @@ -28,7 +28,7 @@ define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], ; GCN: v_cvt_f16_f32_e32 [[SIGN_SEL_F16:v[0-9]+]], [[SIGN_SEL]] ; GCN: {{buffer|flat}}_store_short {{.*}}[[SIGN_SEL_F16]] -define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -39,7 +39,7 @@ define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* } ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32: -define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { %result = sitofp i64 %in to float store float %result, float addrspace(1)* %out ret void @@ -62,7 +62,7 @@ define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} 
[[SIGN_SEL:v[0-9]+]], ; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]] -define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -74,14 +74,14 @@ define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f32: ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1, -define void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ %result = sitofp <2 x i64> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f32: -define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -93,14 +93,14 @@ define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i6 ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f16: ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1, -define void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ %result = sitofp <2 x i64> %in to <2 x half> store <2 x half> %result, <2 x half> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f16: -define void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll index 4c8fea12bada..8e85d9998597 100644 --- a/test/CodeGen/AMDGPU/sint_to_fp.ll +++ b/test/CodeGen/AMDGPU/sint_to_fp.ll @@ -6,7 +6,7 @@ ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { %result = sitofp i32 %in to float store float %result, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{v[0-9]+$}} ; R600: INT_TO_FLT -define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -32,7 +32,7 @@ define void 
@v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X -define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{ %result = sitofp <2 x i32> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -49,7 +49,7 @@ define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) # ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %value = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = sitofp <4 x i32> %value to <4 x float> store <4 x float> %result, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i3 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -81,7 +81,7 @@ define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrsp ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to float store float %fp, float addrspace(1)* %out @@ -92,7 +92,7 @@ define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 { %fp = sitofp i1 %in to float store float %fp, float addrspace(1)* %out ret void @@ -105,7 +105,7 @@ define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]] ; SI: s_endpgm -define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll index 1395fa2bfea0..574d1c0b2c78 100644 --- a/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -7,7 +7,7 @@ ; GCN: 
v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sitofp_i16_to_f16( +define amdgpu_kernel void @sitofp_i16_to_f16( half addrspace(1)* %r, i16 addrspace(1)* %a) { entry: @@ -23,7 +23,7 @@ entry: ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sitofp_i32_to_f16( +define amdgpu_kernel void @sitofp_i32_to_f16( half addrspace(1)* %r, i32 addrspace(1)* %a) { entry: @@ -37,15 +37,24 @@ entry: ; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16 ; GCN: buffer_load_dword -; GCN: v_cvt_f32_i32_e32 -; GCN: v_cvt_f32_i32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-DAG: v_lshlrev_b32_e32 -; GCN-DAG: v_or_b32_e32 -; GCN: buffer_store_dword -; GCN: s_endpgm -define void @sitofp_v2i16_to_v2f16( + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_lshlrev_b32_e32 +; SI: v_or_b32_e32 + +; VI-DAG: v_cvt_f32_i32_sdwa +; VI-DAG: v_cvt_f32_i32_sdwa +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI: v_or_b32_e32 + +; GCN: buffer_store_dword +; GCN: s_endpgm + +define amdgpu_kernel void @sitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) { entry: @@ -56,17 +65,24 @@ entry: } ; GCN-LABEL: {{^}}sitofp_v2i32_to_v2f16 -; GCN: buffer_load_dwordx2 -; GCN: v_cvt_f32_i32_e32 -; GCN: v_cvt_f32_i32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-DAG: v_and_b32_e32 -; GCN-DAG: v_lshlrev_b32_e32 -; GCN-DAG: v_or_b32_e32 -; GCN: buffer_store_dword -; GCN: s_endpgm -define void @sitofp_v2i32_to_v2f16( +; GCN: buffer_load_dwordx2 + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_lshlrev_b32_e32 +; SI: v_or_b32_e32 + +; VI-DAG: v_cvt_f32_i32_e32 +; VI-DAG: v_cvt_f32_i32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI: v_or_b32_e32 + +; GCN: buffer_store_dword +; GCN: s_endpgm +define amdgpu_kernel void @sitofp_v2i32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i32> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll index 60cee7a3499e..3f53572ab440 100644 --- a/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -357,7 +357,7 @@ bb7: ; preds = %bb4 ; CHECK: [[END]]: ; CHECK: s_or_b64 exec, exec ; CHECK: s_endpgm -define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x i32> %arg2) #0 { +define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x float> %arg2) #0 { bb: %tmp = fcmp ult float %arg1, 0.000000e+00 br i1 %tmp, label %bb3, label %bb4 @@ -367,7 +367,7 @@ bb3: ; preds = %bb br label %bb4 bb4: ; preds = %bb3, %bb - %tmp5 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp6 = extractelement <4 x float> %tmp5, i32 0 %tmp7 = fcmp une float %tmp6, 0.000000e+00 br i1 %tmp7, label %bb8, label %bb9 @@ -380,9 +380,8 @@ bb9: ; preds = %bb4 ret void } +declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 declare void @llvm.AMDGPU.kill(float) #0 -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, 
<8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll index 985c73904f43..8665ab697265 100644 --- a/test/CodeGen/AMDGPU/smed3.ll +++ b/test/CodeGen/AMDGPU/smed3.ll @@ -1,12 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32: ; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -24,8 +25,8 @@ define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a ; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32: ; GCN: v_max_i32 ; GCN: v_min_i32 -define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -44,8 +45,8 @@ define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32: ; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} -define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -63,8 +64,8 @@ define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32: ; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} -define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void 
@v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -82,8 +83,8 @@ define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64: ; GCN: v_cmp_lt_i64 ; GCN: v_cmp_gt_i64 -define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid %a = load i64, i64 addrspace(1)* %gep0 @@ -99,9 +100,10 @@ define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a } ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16: -; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +define amdgpu_kernel void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid %a = load i16, i16 addrspace(1)* %gep0 @@ -172,7 +174,7 @@ define internal i8 @smax8(i8 %x, i8 %y) #2 { ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -184,7 +186,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -196,7 +198,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -208,7 +210,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -220,7 +222,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_4(i32 
addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -232,7 +234,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -244,7 +246,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -256,7 +258,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -268,7 +270,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -280,7 +282,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -292,7 +294,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -304,7 +306,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -316,7 +318,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_12: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -328,7 +330,7 @@ 
bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -340,7 +342,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -352,7 +354,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -362,12 +364,13 @@ bb: ret void } +; FIXME: Should keep scalar or not promote ; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0: ; GCN: s_sext_i32_i16 ; GCN: s_sext_i32_i16 ; GCN: s_sext_i32_i16 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { bb: %tmp0 = call i16 @smin16(i16 %x, i16 %y) %tmp1 = call i16 @smax16(i16 %x, i16 %y) @@ -382,7 +385,7 @@ bb: ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { bb: %tmp0 = call i8 @smin8(i8 %x, i8 %y) %tmp1 = call i8 @smax8(i8 %x, i8 %y) @@ -394,7 +397,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_0: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -407,7 +410,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_1: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -420,7 +423,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_2: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -433,7 +436,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void 
@s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -444,6 +447,35 @@ bb: ret void } +; GCN-LABEL: {{^}}v_test_smed3_i16_pat_0: +; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; FIXME: VI not matching med3 +; VI: v_min_i16 +; VI: v_max_i16 +; VI: v_min_i16 +; VI: v_max_i16 + +; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid + %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3 + %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8 + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %x = load i16, i16 addrspace(1)* %gep0 + %y = load i16, i16 addrspace(1)* %gep1 + %z = load i16, i16 addrspace(1)* %gep2 + + %tmp0 = call i16 @smin16(i16 %x, i16 %y) + %tmp1 = call i16 @smax16(i16 %x, i16 %y) + %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z) + %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2) + store i16 %tmp3, i16 addrspace(1)* %out.gep + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind readnone alwaysinline } diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll index ce5d92451647..827d672022eb 100644 --- a/test/CodeGen/AMDGPU/sminmax.ll +++ b/test/CodeGen/AMDGPU/sminmax.ll @@ -7,7 +7,7 @@ ; GCN: s_add_i32 ; EG: MAX_INT -define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { +define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg %res = select i1 %cond, i32 %val, i32 %neg @@ -22,7 +22,7 @@ define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { ; GCN: v_add_i32 ; EG: MAX_INT -define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { %val = load i32, i32 addrspace(1)* %src, align 4 %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg @@ -36,7 +36,7 @@ define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]] ; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]] -define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { %val = load i32, i32 addrspace(1)* %src, align 4 %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg @@ -54,7 +54,7 @@ define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %sr ; EG: MAX_INT ; EG: MAX_INT -define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind { +define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind { %z0 = insertelement <2 x i32> undef, i32 0, i32 0 %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 %t0 = insertelement <2 x i32> undef, i32 2, i32 0 @@ -79,7 +79,7 @@ define void @s_abs_v2i32(<2 x i32> 
addrspace(1)* %out, <2 x i32> %val) nounwind ; EG: MAX_INT ; EG: MAX_INT -define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind { %z0 = insertelement <2 x i32> undef, i32 0, i32 0 %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 %t0 = insertelement <2 x i32> undef, i32 2, i32 0 @@ -109,7 +109,7 @@ define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* % ; EG: MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind { +define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind { %z0 = insertelement <4 x i32> undef, i32 0, i32 0 %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 @@ -146,7 +146,7 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind ; EG: MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind { %z0 = insertelement <4 x i32> undef, i32 0, i32 0 %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 @@ -170,7 +170,7 @@ define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* % ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] -define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind { +define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind { %cond0 = icmp sgt i32 %val0, %val1 %sel0 = select i1 %cond0, i32 %val0, i32 %val1 %sel1 = select i1 %cond0, i32 %val1, i32 %val0 @@ -186,7 +186,7 @@ define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]] ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]] -define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { +define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { %val0 = load volatile i32, i32 addrspace(1)* %ptr0 %val1 = load volatile i32, i32 addrspace(1)* %ptr1 @@ -208,7 +208,7 @@ define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 ; GCN-DAG: s_max_i32 ; GCN-DAG: s_max_i32 ; GCN-DAG: s_max_i32 -define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind { +define amdgpu_kernel void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind { %cond0 = icmp sgt <4 x i32> %val0, %val1 %sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1 %sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0 @@ -223,7 +223,7 @@ define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace( ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc -define void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { 
+define amdgpu_kernel void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { %val0 = load volatile i32, i32 addrspace(1)* %ptr0 %val1 = load volatile i32, i32 addrspace(1)* %ptr1 diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll new file mode 100644 index 000000000000..4e093cdece21 --- /dev/null +++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -0,0 +1,224 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s + +; GCN-LABEL: {{^}}s_abs_v2i16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] +; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 + +; VI: v_sub_i32_e32 +; VI-DAG: v_sub_i32_e32 +; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: v_add_i32_e32 +; VI: v_add_i32_e32 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +; CI: v_sub_i32_e32 +; CI-DAG: v_sub_i32_e32 +; CI: v_bfe_i32 +; CI-DAG: v_bfe_i32 +; CI-DAG: v_add_i32_e32 +; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 +; CI: v_add_i32_e32 +; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, +; CI: v_or_b32_e32 +define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { + %neg = sub <2 x i16> zeroinitializer, %val + %cond = icmp sgt <2 x i16> %val, %neg + %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg + %res2 = add <2 x i16> %res, + store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_abs_v2i16: +; GFX9: flat_load_dword [[VAL:v[0-9]+]] +; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] +; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 + +; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} +; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} +; VI-NOT: v_and_b32 +; VI: v_or_b32_e32 +define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %val = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in, align 4 + %neg = sub <2 x i16> zeroinitializer, %val + %cond = icmp sgt <2 x i16> %val, %neg + %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg + %res2 = add <2 x i16> %res, + store <2 x i16> %res2, <2 x i16> addrspace(1)* %gep.out, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_abs_v2i16_2: +; GFX9: s_load_dword 
[[VAL:s[0-9]+]] +; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] +; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 +define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { + %z0 = insertelement <2 x i16> undef, i16 0, i16 0 + %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 + %t0 = insertelement <2 x i16> undef, i16 2, i16 0 + %t1 = insertelement <2 x i16> %t0, i16 2, i16 1 + %neg = sub <2 x i16> %z1, %val + %cond = icmp sgt <2 x i16> %val, %neg + %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg + %res2 = add <2 x i16> %res, %t1 + store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_abs_v2i16_2: +; GFX9: buffer_load_dword [[VAL:v[0-9]+]] +; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] +; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 +define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { + %z0 = insertelement <2 x i16> undef, i16 0, i16 0 + %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 + %t0 = insertelement <2 x i16> undef, i16 2, i16 0 + %t1 = insertelement <2 x i16> %t0, i16 2, i16 1 + %val = load <2 x i16>, <2 x i16> addrspace(1)* %src, align 4 + %neg = sub <2 x i16> %z1, %val + %cond = icmp sgt <2 x i16> %val, %neg + %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg + %res2 = add <2 x i16> %res, %t1 + store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_abs_v4i16: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, [[VAL0]] +; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], [[VAL0]], [[SUB0]] +; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 + +; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]] +; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]] +; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 +define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { + %z0 = insertelement <4 x i16> undef, i16 0, i16 0 + %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 + %z2 = insertelement <4 x i16> %z1, i16 0, i16 2 + %z3 = insertelement <4 x i16> %z2, i16 0, i16 3 + %t0 = insertelement <4 x i16> undef, i16 2, i16 0 + %t1 = insertelement <4 x i16> %t0, i16 2, i16 1 + %t2 = insertelement <4 x i16> %t1, i16 2, i16 2 + %t3 = insertelement <4 x i16> %t2, i16 2, i16 3 + %neg = sub <4 x i16> %z3, %val + %cond = icmp sgt <4 x i16> %val, %neg + %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg + %res2 = add <4 x i16> %res, %t3 + store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_abs_v4i16: +; GFX9: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} + +; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]] +; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]] +; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 + +; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]] +; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]] +; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 +define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 { + %z0 = insertelement <4 x i16> undef, i16 0, i16 0 + %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 + %z2 = insertelement <4 x i16> %z1, i16 0, i16 2 + %z3 = insertelement <4 x i16> %z2, i16 0, i16 3 + 
%t0 = insertelement <4 x i16> undef, i16 2, i16 0 + %t1 = insertelement <4 x i16> %t0, i16 2, i16 1 + %t2 = insertelement <4 x i16> %t1, i16 2, i16 2 + %t3 = insertelement <4 x i16> %t2, i16 2, i16 3 + %val = load <4 x i16>, <4 x i16> addrspace(1)* %src, align 4 + %neg = sub <4 x i16> %z3, %val + %cond = icmp sgt <4 x i16> %val, %neg + %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg + %res2 = add <4 x i16> %res, %t3 + store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_min_max_v2i16: +define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 { + %cond0 = icmp sgt <2 x i16> %val0, %val1 + %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 + %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0 + + store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4 + store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_min_max_v2i16: +define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { + %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0 + %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1 + + %cond0 = icmp sgt <2 x i16> %val0, %val1 + %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 + %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0 + + store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4 + store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_min_max_v4i32: +define amdgpu_kernel void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { + %cond0 = icmp sgt <4 x i16> %val0, %val1 + %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1 + %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0 + + store volatile <4 x i16> %sel0, <4 x i16> addrspace(1)* %out0, align 4 + store volatile <4 x i16> %sel1, <4 x i16> addrspace(1)* %out1, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_min_max_v2i16_user: +define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { + %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0 + %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1 + + %cond0 = icmp sgt <2 x i16> %val0, %val1 + %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 + %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0 + + store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4 + store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4 + store volatile <2 x i1> %cond0, <2 x i1> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}u_min_max_v2i16: +; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind { + %cond0 = icmp ugt <2 x i16> %val0, %val1 + %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 + %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0 + + store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4 + 
store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll index daac5b92b1ef..343211b0219c 100644 --- a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -12,7 +12,7 @@ ; GCN: buffer_store_dword ; GCN: [[EXIT]]: ; GCN: s_endpgm -define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { +define amdgpu_kernel void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { entry: %cnd = fcmp oeq float 0.0, %cond %sgpr = load volatile i32, i32 addrspace(2)* %in @@ -32,7 +32,7 @@ endif: ; GCN: buffer_store_dword ; GCN: [[EXIT]]: ; GCN: s_endpgm -define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) { +define amdgpu_kernel void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) { entry: %vgpr = load volatile float, float addrspace(1)* %in %cnd = fcmp oeq float 0.0, %vgpr diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 9b118425f9cb..50f72c670598 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -1,16 +1,16 @@ -; RUN: llc < %s -march=amdgcn -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s -; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s +; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SIVI %s ; SMRD load with an immediate offset. 
; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -18,11 +18,11 @@ entry: ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -33,11 +33,11 @@ entry: ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -48,11 +48,11 @@ entry: ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; GCN: s_endpgm -define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -62,11 +62,11 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -76,11 +76,11 @@ entry: ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 
262144 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -88,12 +88,12 @@ entry: ; GCN-LABEL: {{^}}smrd_load_const0: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -102,14 +102,15 @@ main_body: ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1020) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } + ; SMRD load using the load.const intrinsic with an offset greater than the ; largets possible immediate. ; immediate offset. 
@@ -118,12 +119,12 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1024) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -133,12 +134,12 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048572) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -148,18 +149,17 @@ main_body: ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, 
<2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048576) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -attributes #0 = { nounwind readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/sopk-compares.ll b/test/CodeGen/AMDGPU/sopk-compares.ll index 74acc5bc961c..c0f773ca70c2 100644 --- a/test/CodeGen/AMDGPU/sopk-compares.ll +++ b/test/CodeGen/AMDGPU/sopk-compares.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.groupstaticsize() #1 ; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}} -define void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -25,7 +25,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x7fff{{$}} -define void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 32767 br i1 %cmp0, label %endif, label %if @@ -41,7 +41,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max_p1: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -57,7 +57,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ne_i32_simm16_max_p1: ; GCN: s_cmpk_lg_u32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -73,7 +73,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_min(i32 %cond, 
i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, -32768 br i1 %cmp0, label %endif, label %if @@ -89,7 +89,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min_m1: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0xffff7fff{{$}} -define void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, -32769 br i1 %cmp0, label %endif, label %if @@ -105,7 +105,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm15_max: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}} -define void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65535 br i1 %cmp0, label %endif, label %if @@ -121,7 +121,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}} -define void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65535 br i1 %cmp0, label %endif, label %if @@ -137,7 +137,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max_p1: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0x10000{{$}} -define void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65536 br i1 %cmp0, label %endif, label %if @@ -154,7 +154,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -170,7 +170,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ne_i32: ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -186,7 +186,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sgt_i32: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -202,7 +202,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x7fff{{$}} -define void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 32767 br i1 %cmp0, label %endif, label %if @@ -218,7 +218,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max_p1: ; GCN: s_cmp_gt_i32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -234,7 +234,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sge_i32: ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sge_i32(i32 
%cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sge i32 %cond, %size @@ -251,7 +251,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_slt_i32: ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp slt i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -267,7 +267,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sle_i32: ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sle i32 %cond, %size @@ -284,7 +284,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ugt_i32: ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ugt i32 %cond, %size @@ -301,7 +301,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_uge_i32: ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp uge i32 %cond, %size @@ -318,7 +318,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ult_i32: ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -334,7 +334,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16: ; GCN: s_cmp_lt_u32 s2, 0xffff8000 -define void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32768 br i1 %cmp0, label %endif, label %if @@ -350,7 +350,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16_m1: ; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff7fff{{$}} -define void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32769 br i1 %cmp0, label %endif, label %if @@ -366,7 +366,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ule_i32: ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ule i32 %cond, %size @@ -383,7 +383,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_eq_i32: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp eq i32 %size, %cond @@ -400,7 +400,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_ne_i32: ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* 
%out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ne i32 %size, %cond @@ -417,7 +417,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_sgt_i32: ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sgt i32 %size, %cond @@ -434,7 +434,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_sge_i32: ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sge i32 %size, %cond @@ -451,7 +451,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_slt_i32: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp slt i32 %size, %cond @@ -468,7 +468,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_sle_i32: ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sle i32 %size, %cond @@ -485,7 +485,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_ugt_i32: ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ugt i32 %size, %cond @@ -502,7 +502,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_uge_i32: ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp uge i32 %size, %cond @@ -519,7 +519,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_ult_i32: ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ult i32 %size, %cond @@ -536,7 +536,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_ule_i32: ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ule i32 %size, %cond @@ -553,7 +553,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16: ; GCN: s_cmp_lt_u32 s2, 0xfffff7ff -define void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %not.size = xor i32 %size, -1 @@ -573,7 +573,7 @@ endif: ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4 ; SI: v_cmp_eq_u64_e64 -define void 
@br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i64 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -593,7 +593,7 @@ endif: ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: v_cmp_eq_u64_e32 -define void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i64 %cond, 1234 br i1 %cmp0, label %endif, label %if @@ -611,7 +611,7 @@ endif: ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 4 ; SI: v_cmp_ne_u64_e64 -define void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i64 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -631,7 +631,7 @@ endif: ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: v_cmp_ne_u64_e32 -define void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i64 %cond, 1234 br i1 %cmp0, label %endif, label %if diff --git a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll index ff9429843b22..63ea21b05339 100644 --- a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll +++ b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll @@ -4,7 +4,7 @@ ; allocate scratch registers correctly. Check that this test compiles without ; error. ; TONGA-LABEL: test -define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) { entry: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll new file mode 100644 index 000000000000..1ca0919258a8 --- /dev/null +++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll @@ -0,0 +1,78 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s + +; Inline spiller can decide to move a spill as early as possible in the basic block. +; It will skip phis and label, but we also need to make sure it skips instructions +; in the basic block prologue which restore exec mask. 
+; Make sure instruction to restore exec mask immediately follows label + +; CHECK-LABEL: {{^}}spill_cfg_position: +; CHECK: s_cbranch_execz [[LABEL1:BB[0-9_]+]] +; CHECK: {{^}}[[LABEL1]]: +; CHECK: s_cbranch_execz [[LABEL2:BB[0-9_]+]] +; CHECK: {{^}}[[LABEL2]]: +; CHECK-NEXT: s_or_b64 exec +; CHECK: buffer_ + +define amdgpu_kernel void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tmp14 = load i32, i32 addrspace(1)* %arg, align 4 + %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 + %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4 + %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3 + %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4 + %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4 + %tmp22 = load i32, i32 addrspace(1)* %tmp21, align 4 + %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5 + %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4 + %tmp25 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6 + %tmp26 = load i32, i32 addrspace(1)* %tmp25, align 4 + %tmp27 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7 + %tmp28 = load i32, i32 addrspace(1)* %tmp27, align 4 + %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8 + %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4 + %tmp33 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp1 + %tmp34 = load i32, i32 addrspace(1)* %tmp33, align 4 + %tmp35 = icmp eq i32 %tmp34, 0 + br i1 %tmp35, label %bb44, label %bb36 + +bb36: ; preds = %bb + %tmp37 = mul nsw i32 %tmp20, %tmp18 + %tmp38 = add nsw i32 %tmp37, %tmp16 + %tmp39 = mul nsw i32 %tmp24, %tmp22 + %tmp40 = add nsw i32 %tmp38, %tmp39 + %tmp41 = mul nsw i32 %tmp28, %tmp26 + %tmp42 = add nsw i32 %tmp40, %tmp41 + %tmp43 = add nsw i32 %tmp42, %tmp30 + br label %bb52 + +bb44: ; preds = %bb + %tmp45 = mul nsw i32 %tmp18, %tmp16 + %tmp46 = mul nsw i32 %tmp22, %tmp20 + %tmp47 = add nsw i32 %tmp46, %tmp45 + %tmp48 = mul nsw i32 %tmp26, %tmp24 + %tmp49 = add nsw i32 %tmp47, %tmp48 + %tmp50 = mul nsw i32 %tmp30, %tmp28 + %tmp51 = add nsw i32 %tmp49, %tmp50 + br label %bb52 + +bb52: ; preds = %bb44, %bb36 + %tmp53 = phi i32 [ %tmp43, %bb36 ], [ %tmp51, %bb44 ] + %tmp54 = mul nsw i32 %tmp16, %tmp14 + %tmp55 = mul nsw i32 %tmp22, %tmp18 + %tmp56 = mul nsw i32 %tmp24, %tmp20 + %tmp57 = mul nsw i32 %tmp30, %tmp26 + %tmp58 = add i32 %tmp55, %tmp54 + %tmp59 = add i32 %tmp58, %tmp56 + %tmp60 = add i32 %tmp59, %tmp28 + %tmp61 = add i32 %tmp60, %tmp57 + %tmp62 = add i32 %tmp61, %tmp53 + store i32 %tmp62, i32 addrspace(1)* %tmp33, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll index 8c16b9d1649c..0e715c453209 100644 --- a/test/CodeGen/AMDGPU/spill-m0.ll +++ b/test/CodeGen/AMDGPU/spill-m0.ll @@ -17,11 +17,11 @@ ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] -; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill +; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill ; TOVMEM: s_waitcnt vmcnt(0) ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 -; TOSMEM: s_mov_b32 m0, s3{{$}} +; 
TOSMEM: s_add_u32 m0, s3, 0x100{{$}} ; TOSMEM-NOT: [[M0_COPY]] ; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill ; TOSMEM: s_waitcnt lgkmcnt(0) @@ -32,18 +32,18 @@ ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]] -; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload +; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]] ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] -; TOSMEM: s_mov_b32 m0, s3{{$}} +; TOSMEM: s_add_u32 m0, s3, 0x100{{$}} ; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload ; TOSMEM-NOT: [[M0_RESTORE]] ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]] ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1 -define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 %cmp0 = icmp eq i32 %cond, 0 @@ -67,12 +67,12 @@ endif: ; GCN: v_interp_mov_f32 ; TOSMEM-NOT: s_m0 -; TOSMEM: s_mov_b32 m0, s7 +; TOSMEM: s_add_u32 m0, s7, 0x100 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill ; TOSMEM-NOT: m0 ; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s7, 0x100 +; TOSMEM: s_add_u32 m0, s7, 0x200 ; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill ; TOSMEM-NOT: m0 @@ -81,16 +81,16 @@ endif: ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100 +; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 ; GCN-NOT: s_buffer_load_dword m0 -define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) #0 { +define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %m0) #0 { main_body: - %tmp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3) + %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0) %cmp = fcmp ueq float 0.000000e+00, %tmp br i1 %cmp, label %if, label %else @@ -100,14 +100,13 @@ if: ; preds = %main_body br label %endif else: ; preds = %main_body - %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3) + %interp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0) br label %endif endif: ; preds = %else, %if %export = phi float [ %lds_data, %if ], [ %interp, %else ] - %tmp4 = call i32 @llvm.SI.packf16(float %export, float %export) - %tmp5 = bitcast i32 %tmp4 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp5, float %tmp5, float %tmp5, float %tmp5) + %tmp4 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %export, float %export) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp4, <2 x half> %tmp4, i1 true, i1 true) #0 ret void } @@ -122,7 +121,7 @@ endif: ; preds = %else, %if ; GCN: ; clobber m0 ; TOSMEM: s_mov_b32 vcc_hi, m0 -; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: 
s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill ; TOSMEM: s_mov_b32 m0, vcc_hi @@ -131,16 +130,16 @@ endif: ; preds = %else, %if ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM-NEXT: s_mov_b32 m0, s3 +; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 ; GCN-NOT: s_buffer_load_dword m0 -define void @m0_unavailable_spill(i32 %arg3) #0 { +define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 { main_body: %m0 = call i32 asm sideeffect "; def $0, 1", "={M0}"() #0 - %tmp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3) + %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg) call void asm sideeffect "; clobber $0", "~{M0}"() #0 %cmp = fcmp ueq float 0.000000e+00, %tmp br i1 %cmp, label %if, label %else @@ -161,10 +160,10 @@ endif: ; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] ; TOSMEM: s_cmp_eq_u32 ; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill ; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 +; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill ; TOSMEM-NOT: m0 ; TOSMEM: s_cbranch_scc1 @@ -172,7 +171,7 @@ endif: ; TOSMEM: s_mov_b32 m0, -1 ; TOSMEM: s_mov_b32 vcc_hi, m0 -; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload ; TOSMEM: s_mov_b32 m0, vcc_hi ; TOSMEM: s_waitcnt lgkmcnt(0) @@ -180,7 +179,7 @@ endif: ; TOSMEM: ds_write_b64 ; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 +; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload ; TOSMEM-NOT: m0 ; TOSMEM: s_waitcnt lgkmcnt(0) @@ -190,7 +189,7 @@ endif: ; TOSMEM: s_dcache_wb ; TOSMEM: s_endpgm -define void @restore_m0_lds(i32 %arg) { +define amdgpu_kernel void @restore_m0_lds(i32 %arg) { %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 %sval = load volatile i64, i64 addrspace(2)* undef %cmp = icmp eq i32 %arg, 0 @@ -205,10 +204,10 @@ ret: ret void } -declare float @llvm.SI.fs.constant(i32, i32, i32) readnone - -declare i32 @llvm.SI.packf16(float, float) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 9b3dfab2be6a..c05021a91ff0 100644 --- a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -11,7 +11,7 @@ ; Just test that it compiles successfully. 
; CHECK-LABEL: test -define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) diff --git a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll index cab45be8da50..ebba35a6689a 100644 --- a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll +++ b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll @@ -3,11 +3,11 @@ ; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s ; ALL-LABEL: {{^}}spill_sgpr_x2: -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill ; SMEM: s_cbranch_scc1 -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload ; SMEM: s_dcache_wb @@ -44,11 +44,11 @@ ret: } ; ALL-LABEL: {{^}}spill_sgpr_x4: -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill ; SMEM: s_cbranch_scc1 -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload ; SMEM: s_dcache_wb ; SMEM: s_endpgm @@ -93,15 +93,15 @@ ret: ; ALL-LABEL: {{^}}spill_sgpr_x8: -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill -; SMEM: s_add_u32 m0, s3, 16 +; SMEM: s_add_u32 m0, s3, 0x110{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill ; SMEM: s_cbranch_scc1 -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload -; SMEM: s_add_u32 m0, s3, 16 +; SMEM: s_add_u32 m0, s3, 0x110{{$}} ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload ; SMEM: s_dcache_wb diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll index d4e2dc814050..5d7d29db3a2f 100644 --- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll +++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: ; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}} ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc -define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { %v.val = load volatile i32, i32 addrspace(1)* %in %vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 @@ -23,7 +23,7 @@ define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0: ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f ; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0 -define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { +define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_0(i64 
addrspace(1)* %out, i32 %val) { %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -35,7 +35,7 @@ define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { %v.val = load volatile i32, i32 addrspace(1)* %in %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 @@ -48,7 +48,7 @@ define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1: ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}} -define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { +define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -61,7 +61,7 @@ define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i6 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2: ; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc -define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep diff --git a/test/CodeGen/AMDGPU/split-smrd.ll b/test/CodeGen/AMDGPU/split-smrd.ll index d07da1030936..cdb1b1e3b503 100644 --- a/test/CodeGen/AMDGPU/split-smrd.ll +++ b/test/CodeGen/AMDGPU/split-smrd.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: Move this to sgpr-copy.ll when this is fixed on VI. ; Make sure that when we split an smrd instruction in order to move it to ; the VALU, we are also moving its users to the VALU. 
-; CHECK-LABEL: {{^}}split_smrd_add_worklist: -; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 +; GCN-LABEL: {{^}}split_smrd_add_worklist: +; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { bb: %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) @@ -21,27 +21,22 @@ bb3: ; preds = %bb %tmp6 = sext i32 %tmp5 to i64 %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6 %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp10 = extractelement <4 x float> %tmp9, i32 0 - %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef) - %tmp13 = bitcast i32 %tmp12 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp13, float undef, float undef) + %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void } -; Function Attrs: nounwind readnone +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare i32 @llvm.SI.packf16(float, float) #1 - attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} -!1 = !{!"const", !3} -!2 = !{!1, !1, i64 0} -!3 = !{!"tbaa root"} +!1 = !{!"const", !2} +!2 = !{!"tbaa root"} diff --git a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll index 37ec2b012896..c2426993bb3a 100644 --- a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll +++ b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s @sPrivateStorage = internal addrspace(3) global [256 x [8 x <4 x i64>]] undef @@ -29,7 +29,7 @@ ; GCN-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 ; GCN: s_endpgm -define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 { +define amdgpu_kernel void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 
addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 { entry: %tmp = tail call i32 @llvm.r600.read.local.size.y() %tmp1 = tail call i32 @llvm.r600.read.local.size.z() diff --git a/test/CodeGen/AMDGPU/splitkit.mir b/test/CodeGen/AMDGPU/splitkit.mir new file mode 100644 index 000000000000..41782af40e3c --- /dev/null +++ b/test/CodeGen/AMDGPU/splitkit.mir @@ -0,0 +1,105 @@ +# RUN: llc -o - %s -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s +--- | + define amdgpu_kernel void @func0() #0 { ret void } + define amdgpu_kernel void @func1() #0 { ret void } + define amdgpu_kernel void @splitHoist() #0 { ret void } + + attributes #0 = { "amdgpu-num-sgpr"="12" } +... +--- +# Make sure we only get a single spill+reload even if liverange splitting +# created a sequence of multiple copy instructions. +# CHECK-LABEL: name: func0 +# CHECK: SI_SPILL_S128_SAVE +# CHECK-NOT: SI_SPILL_S128_SAVE +# CHECK: S_NOP 0 +# CHECK: SI_SPILL_S128_RESTORE +# CHECK-NOT: SI_SPILL_S128_RESTORE +name: func0 +body: | + bb.0: + S_NOP 0, implicit-def undef %0.sub0 : sreg_128 + S_NOP 0, implicit-def %0.sub3 : sreg_128 + + ; Clobber registers + S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11 + + S_NOP 0, implicit %0.sub0 + S_NOP 0, implicit %0.sub3 + S_NOP 0, implicit %0.sub0 + S_NOP 0, implicit %0.sub3 +... +--- +# LiveRange splitting should split this into 2 intervals with the second getting +# allocated to sgpr0_sgpr1 and the first to something else so we see two copies +# in between for the two subregisters that are alive. +# CHECK-LABEL: name: func1 +# CHECK: [[REG0:%sgpr[0-9]+]] = COPY %sgpr0 +# CHECK: [[REG1:%sgpr[0-9]+]] = COPY %sgpr2 +# CHECK: S_NOP 0 +# CHECK: S_NOP 0, implicit [[REG0]] +# CHECK: S_NOP 0, implicit [[REG1]] +# CHECK: %sgpr0 = COPY [[REG0]] +# CHECK: %sgpr2 = COPY [[REG1]] +# CHECK: S_NOP +# CHECK: S_NOP 0, implicit %sgpr0 +# CHECK: S_NOP 0, implicit %sgpr2 +name: func1 +tracksRegLiveness: true +body: | + bb.0: + liveins: %sgpr0, %sgpr1, %sgpr2 + undef %0.sub0 : sreg_128 = COPY %sgpr0 + %0.sub2 = COPY %sgpr2 + + S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1 + + S_NOP 0, implicit %0.sub0 + S_NOP 0, implicit %0.sub2 + + ; Clobber everything but sgpr0-sgpr3 + S_NOP 0, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11, implicit-def dead %sgpr12, implicit-def dead %sgpr13, implicit-def dead %sgpr14, implicit-def dead %sgpr15, implicit-def dead %vcc_lo, implicit-def dead %vcc_hi + + S_NOP 0, implicit %0.sub0 + S_NOP 0, implicit %0.sub2 +... +--- +# Check that copy hoisting out of loops works. This mainly should not crash the +# compiler when it hoists a subreg copy sequence. 
+# CHECK-LABEL: name: splitHoist +# CHECK: S_NOP 0, implicit-def %sgpr0 +# CHECK: S_NOP 0, implicit-def %sgpr3 +# CHECK-NEXT: SI_SPILL_S128_SAVE +name: splitHoist +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + S_NOP 0, implicit-def undef %0.sub0 : sreg_128 + S_NOP 0, implicit-def %0.sub3 : sreg_128 + + S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc + S_BRANCH %bb.2 + + bb.1: + successors: %bb.1, %bb.3 + S_NOP 0, implicit %0.sub0 + + ; Clobber registers + S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11 + + S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc + S_BRANCH %bb.3 + + bb.2: + successors: %bb.3 + ; Clobber registers + S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11 + S_BRANCH %bb.3 + + bb.3: + S_NOP 0, implicit %0.sub0 + S_NOP 0, implicit %0.sub3 + S_NOP 0, implicit %0.sub0 + S_NOP 0, implicit %0.sub3 +... diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index ad7c86fe7919..b4355b76016a 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #0 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -37,7 +37,7 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -49,9 +49,9 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; FUNC-LABEL: {{^}}ashr_v2i16: ; FIXME: The ashr operation is uniform, but because its operands come from a ; global load we end up with the vector instructions rather than scalar. 
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr @@ -63,11 +63,11 @@ define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %i ; FUNC-LABEL: {{^}}ashr_v4i16: ; FIXME: The ashr operation is uniform, but because its operands come from a ; global load we end up with the vector instructions rather than scalar. -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr @@ -80,7 +80,7 @@ define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %i ; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 ; EG: ASHR -define void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) { entry: %in.ext = sext i32 %in to i64 %ashr = ashr i64 %in.ext, 8 @@ -105,7 +105,7 @@ entry: ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} ; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { entry: %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in @@ -143,7 +143,7 @@ entry: ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -156,7 +156,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i ; XFUNC-LABEL: {{^}}s_ashr_v2i64: ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}} ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}} -; define void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) { +; define amdgpu_kernel void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) { ; %result = ashr <2 x i64> %a, %b ; store <2 x i64> %result, <2 x i64> addrspace(1)* %out ; ret void @@ -221,7 +221,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr @@ -235,7 +235,7 @@ define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}} ; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}} -define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = ashr i64 %a, 32 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out @@ -247,7 +247,7 @@ define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; VI: flat_load_dword v[[HI:[0-9]+]] ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]] ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}} -define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -262,7 +262,7 @@ define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} -define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = ashr i64 %a, 63 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out @@ -275,7 +275,7 @@ define void @s_ashr_63_i64(i64 addrspace(1)* 
%out, i64 %a, i64 %b) { ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]] ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]] ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[SHIFT]]:[[COPY]]{{\]}} -define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll index c78fd549b316..c89f798397ae 100644 --- a/test/CodeGen/AMDGPU/srem.ll +++ b/test/CodeGen/AMDGPU/srem.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s -define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in %den = load i32, i32 addrspace(1) * %den_ptr @@ -11,7 +11,7 @@ define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ret void } -define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = srem i32 %num, 4 store i32 %result, i32 addrspace(1)* %out @@ -24,14 +24,14 @@ define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_mul_lo_i32 ; SI: v_sub_i32 ; SI: s_endpgm -define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = srem i32 %num, 7 store i32 %result, i32 addrspace(1)* %out ret void } -define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr @@ -40,14 +40,14 @@ define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ret void } -define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %result = srem <2 x i32> %num, store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void } -define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr @@ -56,14 +56,14 @@ define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ret void } -define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = srem <4 x i32> %num, store <4 x i32> 
%result, <4 x i32> addrspace(1)* %out ret void } -define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %num = load i64, i64 addrspace(1) * %in %den = load i64, i64 addrspace(1) * %den_ptr @@ -72,14 +72,14 @@ define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ret void } -define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %num = load i64, i64 addrspace(1) * %in %result = srem i64 %num, 4 store i64 %result, i64 addrspace(1)* %out ret void } -define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %num = load <2 x i64>, <2 x i64> addrspace(1) * %in %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr @@ -88,14 +88,14 @@ define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i ret void } -define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %num = load <2 x i64>, <2 x i64> addrspace(1) * %in %result = srem <2 x i64> %num, store <2 x i64> %result, <2 x i64> addrspace(1)* %out ret void } -define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %num = load <4 x i64>, <4 x i64> addrspace(1) * %in %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr @@ -104,7 +104,7 @@ define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i ret void } -define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %num = load <4 x i64>, <4 x i64> addrspace(1) * %in %result = srem <4 x i64> %num, store <4 x i64> %result, <4 x i64> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index 6b006fd936d7..1daf4bb33e81 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() #0 ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -26,7 +26,7 @@ define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -50,7 +50,7 @@ define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -74,7 +74,7 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}} ; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]] ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 -define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -112,7 +112,7 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; EG-DAG: CNDE_INT {{.*}}, 0.0 ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -178,7 +178,7 @@ define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr @@ -193,7 +193,7 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { %result = lshr i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void @@ -203,7 +203,7 @@ define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { ; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}} -define void @v_lshr_32_i64(i64 
addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll index 26884a1b7761..135632343f90 100644 --- a/test/CodeGen/AMDGPU/ssubo.ll +++ b/test/CodeGen/AMDGPU/ssubo.ll @@ -6,7 +6,7 @@ declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone ; FUNC-LABEL: {{^}}ssubo_i64_zext: -define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -17,7 +17,7 @@ define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { } ; FUNC-LABEL: {{^}}s_ssubo_i32: -define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %ssub, 0 %carry = extractvalue { i32, i1 } %ssub, 1 @@ -27,7 +27,7 @@ define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 } ; FUNC-LABEL: {{^}}v_ssubo_i32: -define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -41,7 +41,7 @@ define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 ; FUNC-LABEL: {{^}}s_ssubo_i64: ; SI: s_sub_u32 ; SI: s_subb_u32 -define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -53,7 +53,7 @@ define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 ; FUNC-LABEL: {{^}}v_ssubo_i64: ; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll index 57a93ccd2505..afa4e94222cd 100644 --- a/test/CodeGen/AMDGPU/store-barrier.ll +++ b/test/CodeGen/AMDGPU/store-barrier.ll 
@@ -12,7 +12,7 @@ ; CHECK: s_barrier ; CHECK: s_endpgm ; Function Attrs: nounwind -define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 { +define amdgpu_kernel void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 { bb: %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll index 5d49795a68ec..160e921fc075 100644 --- a/test/CodeGen/AMDGPU/store-global.ll +++ b/test/CodeGen/AMDGPU/store-global.ll @@ -11,7 +11,7 @@ ; CM-NOT: MEM_RAT MSKOR ; GCN: buffer_store_byte -define void @store_i1(i1 addrspace(1)* %out) { +define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) { entry: store i1 true, i1 addrspace(1)* %out ret void @@ -42,7 +42,7 @@ entry: ; GCN: buffer_store_byte -define void @store_i8(i8 addrspace(1)* %out, i8 %in) { +define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) { entry: store i8 %in, i8 addrspace(1)* %out ret void @@ -75,7 +75,7 @@ entry: ; EG: MOV * T[[RW_GPR]].Z, 0.0 ; GCN: buffer_store_short -define void @store_i16(i16 addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) { entry: store i16 %in, i16 addrspace(1)* %out ret void @@ -88,7 +88,7 @@ entry: ; EG: MEM_RAT MSKOR ; EG: MEM_RAT MSKOR -define void @store_i24(i24 addrspace(1)* %out, i24 %in) { +define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) { entry: store i24 %in, i24 addrspace(1)* %out ret void @@ -104,7 +104,7 @@ entry: ; CM: MEM_RAT_CACHELESS STORE_DWORD ; CM-NOT: MEM_RAT -define void @store_i25(i25 addrspace(1)* %out, i25 %in) { +define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) { entry: store i25 %in, i25 addrspace(1)* %out ret void @@ -119,7 +119,7 @@ entry: ; CM-NOT: MEM_RAT MSKOR ; GCN: buffer_store_short -define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(1)* %out @@ -136,7 +136,7 @@ entry: ; CM-NOT: MEM_RAT MSKOR ; SI: buffer_store_byte -define void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1 @@ -150,7 +150,7 @@ entry: ; CM: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dword -define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(1)* %out @@ -170,7 +170,7 @@ entry: ; SI: buffer_store_short ; SI: buffer_store_short -define void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2 @@ -183,7 +183,7 @@ entry: ; CM: MEM_RAT_CACHELESS 
STORE_DWORD ; GCN: buffer_store_dword -define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out @@ -210,7 +210,7 @@ entry: ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1 @@ -231,7 +231,7 @@ entry: ; SI: buffer_store_short ; SI: buffer_store_short ; SI-NOT: buffer_store_dword -define void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2 @@ -246,7 +246,7 @@ entry: ; GCN: buffer_store_dword -define void @store_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) { store float %in, float addrspace(1)* %out ret void } @@ -257,7 +257,7 @@ define void @store_f32(float addrspace(1)* %out, float %in) { ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}} ; GCN: buffer_store_dwordx2 -define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> store <4 x i16> %0, <4 x i16> addrspace(1)* %out @@ -272,7 +272,7 @@ entry: ; GCN: buffer_store_dwordx2 -define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { entry: %0 = insertelement <2 x float> , float %a, i32 0 %1 = insertelement <2 x float> %0, float %b, i32 1 @@ -286,7 +286,7 @@ entry: ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}}, -define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { +define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 ret void } @@ -299,7 +299,7 @@ define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx4 -define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out ret void @@ -313,7 +313,7 @@ entry: ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; SI: buffer_store_dwordx4 -define void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -328,7 +328,7 @@ entry: ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx4 -define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %1 = load <4 x float>, <4 x float> 
addrspace(1) * %in store <4 x float> %1, <4 x float> addrspace(1)* %out ret void @@ -340,7 +340,7 @@ define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1 ; CM: MEM_RAT MSKOR ; GCN: buffer_store_byte -define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(1)* %out @@ -350,7 +350,7 @@ entry: ; FUNC-LABEL: {{^}}store_i64_i16: ; EG: MEM_RAT MSKOR ; GCN: buffer_store_short -define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(1)* %out @@ -369,7 +369,7 @@ entry: ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx2 -define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { entry: %0 = load i32, i32 addrspace(2)* %mem, align 4 %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 @@ -388,7 +388,7 @@ entry: ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X ; GCN: buffer_store_dwordx4 -define void @i128-const-store(i32 addrspace(1)* %out) { +define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) { entry: store i32 1, i32 addrspace(1)* %out, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 diff --git a/test/CodeGen/AMDGPU/store-local.ll b/test/CodeGen/AMDGPU/store-local.ll index 03fd30ca9a25..c144bf2aa878 100644 --- a/test/CodeGen/AMDGPU/store-local.ll +++ b/test/CodeGen/AMDGPU/store-local.ll @@ -9,7 +9,7 @@ ; CM: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void @store_local_i1(i1 addrspace(3)* %out) { +define amdgpu_kernel void @store_local_i1(i1 addrspace(3)* %out) { entry: store i1 true, i1 addrspace(3)* %out ret void @@ -21,7 +21,7 @@ entry: ; CM: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { +define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { store i8 %in, i8 addrspace(3)* %out ret void } @@ -32,7 +32,7 @@ define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { ; CM: LDS_SHORT_WRITE ; GCN: ds_write_b16 -define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { +define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { store i16 %in, i16 addrspace(3)* %out ret void } @@ -43,7 +43,7 @@ define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { ; CM: LDS_WRITE ; GCN: ds_write_b32 -define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { +define amdgpu_kernel void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(3)* %out ret void @@ -55,7 +55,7 @@ entry: ; CM: LDS_WRITE ; GCN: ds_write_b32 -define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out ret void @@ -78,7 +78,7 @@ entry: ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: ds_write_b8 -define void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1 ret void @@ -95,7 +95,7 @@ 
entry: ; GCN: ds_write_b16 ; GCN: ds_write_b16 -define void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2 ret void @@ -111,7 +111,7 @@ entry: ; CM-NOT: LDS_WRITE ; GCN: ds_write_b64 -define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { entry: store <2 x i32> %in, <2 x i32> addrspace(3)* %out ret void @@ -129,7 +129,7 @@ entry: ; CM: LDS_WRITE ; GCN: ds_write2_b64 -define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out ret void @@ -148,7 +148,7 @@ entry: ; GCN: ds_write2_b32 ; GCN: ds_write2_b32 -define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4 ret void @@ -157,7 +157,7 @@ entry: ; FUNC-LABEL: {{^}}store_local_i64_i8: ; EG: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { +define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(3)* %out @@ -167,7 +167,7 @@ entry: ; FUNC-LABEL: {{^}}store_local_i64_i16: ; EG: LDS_SHORT_WRITE ; GCN: ds_write_b16 -define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { +define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(3)* %out diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll index 33d27f24e9cf..ab73ada370ea 100644 --- a/test/CodeGen/AMDGPU/store-private.ll +++ b/test/CodeGen/AMDGPU/store-private.ll @@ -15,7 +15,7 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_i1(i1 addrspace(0)* %out) { +define amdgpu_kernel void @store_i1(i1 addrspace(0)* %out) { entry: store i1 true, i1 addrspace(0)* %out ret void @@ -44,7 +44,7 @@ entry: ; SI: buffer_store_byte -define void @store_i8(i8 addrspace(0)* %out, i8 %in) { +define amdgpu_kernel void @store_i8(i8 addrspace(0)* %out, i8 %in) { entry: store i8 %in, i8 addrspace(0)* %out ret void @@ -72,7 +72,7 @@ entry: ; EG: MOV * T(0 + AR.x).X+, [[RES]] ; SI: buffer_store_short -define void @store_i16(i16 addrspace(0)* %out, i16 %in) { +define amdgpu_kernel void @store_i16(i16 addrspace(0)* %out, i16 %in) { entry: store i16 %in, i16 addrspace(0)* %out ret void @@ -102,7 +102,7 @@ entry: ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, ; CM: MOVA_INT ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, -define void @store_i24(i24 addrspace(0)* %out, i24 %in) { +define amdgpu_kernel void @store_i24(i24 addrspace(0)* %out, i24 %in) { entry: store i24 %in, i24 addrspace(0)* %out ret void @@ -120,7 +120,7 @@ entry: ; CM: MOVA_INT ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; CM-NOT: MOVA_INT -define void @store_i25(i25 addrspace(0)* %out, i25 %in) { +define amdgpu_kernel void @store_i25(i25 addrspace(0)* %out, i25 %in) { entry: store i25 %in, i25 addrspace(0)* %out ret void @@ -141,7 +141,7 @@ entry: ; CM-NOT: MOVA_INT ; SI: buffer_store_short -define void 
@store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(0)* %out @@ -172,7 +172,7 @@ entry: ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1 @@ -191,7 +191,7 @@ entry: ; CM-NOT: MOVA_INT ; SI: buffer_store_dword -define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(0)* %out @@ -223,7 +223,7 @@ entry: ; SI: buffer_store_short ; SI: buffer_store_short -define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2 @@ -240,7 +240,7 @@ entry: ; CM-NOT: MOVA_INT ; SI: buffer_store_dword -define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out @@ -299,7 +299,7 @@ entry: ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1 @@ -410,7 +410,7 @@ entry: ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { +define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { entry: %0 = trunc <8 x i32> %in to <8 x i8> store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1 @@ -443,7 +443,7 @@ entry: ; SI: buffer_store_short ; SI: buffer_store_short ; SI-NOT: buffer_store_dword -define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2 @@ -460,7 +460,7 @@ entry: ; SI: buffer_store_dword -define void @store_f32(float addrspace(0)* %out, float %in) { +define amdgpu_kernel void @store_f32(float addrspace(0)* %out, float %in) { store float %in, float addrspace(0)* %out ret void } @@ -480,7 +480,7 @@ define void @store_f32(float addrspace(0)* %out, float %in) { ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> store <4 x i16> %0, <4 x i16> addrspace(0)* %out @@ -504,7 +504,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, 
float %b) { +define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) { entry: %0 = insertelement <2 x float> , float %a, i32 0 %1 = insertelement <2 x float> %0, float %b, i32 1 @@ -533,7 +533,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { +define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16 ret void } @@ -563,7 +563,7 @@ define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(0)* %out ret void @@ -594,7 +594,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4 ret void @@ -626,7 +626,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { +define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { %1 = load <4 x float>, <4 x float> addrspace(0) * %in store <4 x float> %1, <4 x float> addrspace(0)* %out ret void @@ -644,7 +644,7 @@ define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(0)* %out @@ -663,7 +663,7 @@ entry: ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_short -define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(0)* %out @@ -689,7 +689,7 @@ entry: ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { entry: %0 = load i32, i32 addrspace(2)* %mem, align 4 %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 @@ -727,7 +727,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @i128-const-store(i32 addrspace(0)* %out) { +define amdgpu_kernel void @i128-const-store(i32 addrspace(0)* %out) { entry: store i32 1, i32 addrspace(0)* %out, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll index 78db2d37724b..7518e887135c 100644 --- a/test/CodeGen/AMDGPU/store-v3i64.ll +++ b/test/CodeGen/AMDGPU/store-v3i64.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}global_store_v3i64: ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 ; GCN-DAG: buffer_store_dwordx4 
v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 ret void } @@ -40,7 +40,7 @@ define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 ret void } @@ -48,7 +48,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64 ; GCN-LABEL: {{^}}local_store_v3i64: ; GCN: ds_write2_b64 ; GCN: ds_write_b64 -define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { +define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 ret void } @@ -83,7 +83,7 @@ define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: ds_write_b8 -define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { +define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1 ret void } @@ -91,7 +91,7 @@ define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32: ; GCN-DAG: buffer_store_dwordx2 ; GCN-DAG: buffer_store_dword v -define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i32> store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out ret void @@ -100,7 +100,7 @@ define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i16: ; GCN-DAG: buffer_store_short ; GCN-DAG: buffer_store_dword v -define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i16> store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out ret void @@ -110,7 +110,7 @@ define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8: ; GCN-DAG: buffer_store_short ; GCN-DAG: buffer_store_byte v -define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i8> store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out ret void @@ -120,7 +120,7 @@ define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i ; GCN-DAG: buffer_store_byte v ; GCN-DAG: buffer_store_byte v ; GCN-DAG: buffer_store_byte v -define void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i1> store <3 x i1> 
%trunc, <3 x i1> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/test/CodeGen/AMDGPU/store-vector-ptrs.ll index d5af3b29118a..507f07dee052 100644 --- a/test/CodeGen/AMDGPU/store-vector-ptrs.ll +++ b/test/CodeGen/AMDGPU/store-vector-ptrs.ll @@ -5,7 +5,7 @@ ; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting ; scratch loads and stores. ; CHECK-LABEL: {{^}}store_vector_ptrs: -define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { +define amdgpu_kernel void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> store <4 x i32*> %p, <4 x i32*>* %out ret void diff --git a/test/CodeGen/AMDGPU/store_typed.ll b/test/CodeGen/AMDGPU/store_typed.ll index 515fcf04f406..eaa21617f937 100644 --- a/test/CodeGen/AMDGPU/store_typed.ll +++ b/test/CodeGen/AMDGPU/store_typed.ll @@ -6,7 +6,7 @@ ; EG: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}, 1 ; CM: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}} -define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { +define amdgpu_kernel void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 0) ret void } @@ -16,7 +16,7 @@ define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { ; EG: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}, 1 ; CM: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}} -define void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) { +define amdgpu_kernel void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) { call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 11) ret void } diff --git a/test/CodeGen/AMDGPU/structurize.ll b/test/CodeGen/AMDGPU/structurize.ll index 174e64e2cf8b..3cceb2d45c93 100644 --- a/test/CodeGen/AMDGPU/structurize.ll +++ b/test/CodeGen/AMDGPU/structurize.ll @@ -45,7 +45,7 @@ ; CHECK: CF_END -define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = icmp ne i32 %a, 0 br i1 %0, label %diamond_head, label %branch_from diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll index db0f50247e38..2e7d0e615e07 100644 --- a/test/CodeGen/AMDGPU/structurize1.ll +++ b/test/CodeGen/AMDGPU/structurize1.ll @@ -19,7 +19,7 @@ ; CHECK-LABEL: {{^}}if_inside_loop: ; CHECK: LOOP_START_DX10 ; CHECK: END_LOOP -define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { entry: br label %for.body diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll index b5d5f56b2796..6642411f7a63 100644 --- a/test/CodeGen/AMDGPU/sub.i16.ll +++ b/test/CodeGen/AMDGPU/sub.i16.ll @@ -7,7 +7,7 @@ ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 
%tid @@ -24,7 +24,7 @@ define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -39,7 +39,7 @@ define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* % ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -52,9 +52,9 @@ define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1 ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_sub_i16_inline_63: ; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffc1, [[A]] +; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -70,7 +70,7 @@ define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_dword [[ADD]] -define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -85,12 +85,12 @@ define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; FIXME: Need to handle non-uniform case for function below (load without gep). 
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64: +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -110,7 +110,7 @@ define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] -define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -131,7 +131,7 @@ define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -149,7 +149,7 @@ define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute: ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}} ; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}} -define void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %size = call i32 @llvm.amdgcn.groupstaticsize() %size.trunc = trunc i32 %size to i16 call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds) diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index 5816345098af..f366029fdea2 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -25,7 +25,7 @@ define void @test_sub_i32(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -45,7 +45,7 @@ define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1) ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -55,7 +55,7 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1) } ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %a = load i16, i16 addrspace(1)* %in %b = load i16, i16 addrspace(1)* %b_ptr @@ -69,7 +69,7 @@ define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 %a = load <2 x i16>, <2 x i16> addrspace(1) * %in %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr @@ -85,7 +85,7 @@ define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1) ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 %a = load <4 x i16>, <4 x i16> addrspace(1) * %in %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr @@ -103,7 +103,7 @@ define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1) ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT {{[* ]*}} -define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { %result = sub i64 %a, %b store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -118,7 +118,7 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT {{[* ]*}} -define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { +define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { %tid = 
call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid @@ -134,7 +134,7 @@ define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias ; SI: v_subb_u32_e32 ; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid @@ -154,7 +154,7 @@ define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace( ; SI: v_subb_u32_e32 ; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll new file mode 100644 index 000000000000..69f0accef628 --- /dev/null +++ b/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -0,0 +1,278 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_sub_v2i16:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = sub <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_v2i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
+
+; VI: s_sub_i32
+; VI: s_sub_i32
+define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+  %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
+  %add = sub <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
+; GCN: buffer_store_dword [[ZERO]]
+define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+  %add = sub <2 x i16> %a, %a
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: VI should not scalarize arg access.
+; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+
+; VI: v_subrev_i32_e32
+; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+  %add = sub <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 123, i16 456>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
+
+; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
+; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD0]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_subrev_i16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
+; VI-NOT: v_subrev_i16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 32, i16 0>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; The high element gives fp
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_subrev_i16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffc080, v{{[0-9]+}}
+; VI-NOT: v_subrev_i16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 0, i16 16256>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i32: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] +; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} + +; VI: flat_load_ushort v[[A_HI:[0-9]+]] +; VI: flat_load_ushort v[[A_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_ushort v[[B_LO:[0-9]+]] + +; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] +; VI-NOT: and +; VI-NOT: shl +; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; VI-NOT: and +; VI-NOT: shl +; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} +define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = sub <2 x i16> %a, %b + %ext = zext <2 x i16> %add to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] +; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9: buffer_store_dwordx4 + +; VI: flat_load_ushort v[[A_LO:[0-9]+]] +; VI: flat_load_ushort v[[A_HI:[0-9]+]] +; VI: flat_load_ushort v[[B_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] + +; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; VI-DAG: v_subrev_u16_e32 +; VI-DAG: v_subrev_u16_e32 + +; VI: buffer_store_dwordx4 +define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = sub <2 x i16> %a, %b + %ext = zext <2 x i16> %add to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i32: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16 +; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} + +; VI: v_subrev_u16_e32 +; VI: v_subrev_u16_e32 +; VI: buffer_store_dwordx2 +define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = sub <2 x i16> %a, %b + %ext = sext <2 x i16> %add to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i64: +; GCN: flat_load_dword +; GCN: flat_load_dword + +; GFX9: v_pk_sub_i16 +; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} + +; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_subrev_u16_e32 + +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = sub <2 x i16> %a, %b + %ext = sext <2 x i16> %add to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll index ec2ed78b4954..c2d04abf829f 100644 --- a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -1,39 +1,37 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s +; RUN: llc -march=amdgcn -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -; SI-LABEL:{{^}}row_filter_C1_D0: -; SI: s_endpgm -; Function Attrs: nounwind -define void @row_filter_C1_D0() { +; GCN-LABEL:{{^}}row_filter_C1_D0: +define amdgpu_kernel void @row_filter_C1_D0() #0 { entry: br i1 undef, label %for.inc.1, label %do.body.preheader do.body.preheader: ; preds = %entry - %0 = insertelement <4 x i32> 
zeroinitializer, i32 undef, i32 1 + %tmp = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 br i1 undef, label %do.body56.1, label %do.body90 do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader - %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ] - %2 = insertelement <4 x i32> %1, i32 undef, i32 2 - %3 = insertelement <4 x i32> %2, i32 undef, i32 3 + %tmp1 = phi <4 x i32> [ %tmp6, %do.body56.2 ], [ %tmp5, %do.body56.1 ], [ %tmp, %do.body.preheader ] + %tmp2 = insertelement <4 x i32> %tmp1, i32 undef, i32 2 + %tmp3 = insertelement <4 x i32> %tmp2, i32 undef, i32 3 br i1 undef, label %do.body124.1, label %do.body.1562.preheader do.body.1562.preheader: ; preds = %do.body124.1, %do.body90 - %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ] - %4 = insertelement <4 x i32> undef, i32 undef, i32 1 + %storemerge = phi <4 x i32> [ %tmp3, %do.body90 ], [ %tmp7, %do.body124.1 ] + %tmp4 = insertelement <4 x i32> undef, i32 undef, i32 1 br label %for.inc.1 do.body56.1: ; preds = %do.body.preheader - %5 = insertelement <4 x i32> %0, i32 undef, i32 1 + %tmp5 = insertelement <4 x i32> %tmp, i32 undef, i32 1 %or.cond472.1 = or i1 undef, undef br i1 %or.cond472.1, label %do.body56.2, label %do.body90 do.body56.2: ; preds = %do.body56.1 - %6 = insertelement <4 x i32> %5, i32 undef, i32 1 + %tmp6 = insertelement <4 x i32> %tmp5, i32 undef, i32 1 br label %do.body90 do.body124.1: ; preds = %do.body90 - %7 = insertelement <4 x i32> %3, i32 undef, i32 3 + %tmp7 = insertelement <4 x i32> %tmp3, i32 undef, i32 3 br label %do.body.1562.preheader for.inc.1: ; preds = %do.body.1562.preheader, %entry @@ -42,8 +40,8 @@ for.inc.1: ; preds = %do.body.1562.prehea unreachable } -; SI-LABEL: {{^}}foo: -; SI: s_endpgm +; GCN-LABEL: {{^}}foo: +; GCN: s_endpgm define amdgpu_ps void @foo() #0 { bb: br i1 undef, label %bb2, label %bb1 @@ -67,7 +65,7 @@ bb7: ; preds = %bb6 br label %bb4 bb9: ; preds = %bb2 - %tmp10 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp10 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp11 = extractelement <4 x float> %tmp10, i32 1 %tmp12 = extractelement <4 x float> %tmp10, i32 3 br label %bb14 @@ -78,9 +76,9 @@ bb13: ; preds = %bb2 bb14: ; preds = %bb27, %bb24, %bb9 %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ] %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ] - %tmp17 = fmul float 10.5, %tmp16 - %tmp18 = fmul float 11.5, %tmp15 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17) + %tmp17 = fmul float 1.050000e+01, %tmp16 + %tmp18 = fmul float 1.150000e+01, %tmp15 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp18, float %tmp17, float %tmp17, float %tmp17, i1 true, i1 true) #0 ret void bb23: ; preds = %bb13 @@ -97,13 +95,9 @@ bb27: ; preds = %bb24 br label %bb14 } -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, 
float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 72a1f1e25b30..35615c40d498 100644 --- a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -20,7 +20,7 @@ target triple="amdgcn--" ; CHECK-NEXT: s_mov_b32 s6, -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; CHECK-NEXT: s_endpgm -define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { entry: %v0 = insertelement <4 x float> undef, float %a0, i32 0 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll index 8bd995a8ecbb..57c267e54a14 100644 --- a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll +++ b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll @@ -5,7 +5,7 @@ ; Just make sure this test doesn't crash. ; CHECK-LABEL: foobar: ; CHECK: s_endpgm -define void @foobar() { +define amdgpu_kernel void @foobar() { %v0 = icmp eq <4 x i32> undef, %v3 = sext <4 x i1> %v0 to <4 x i32> %v4 = extractelement <4 x i32> %v3, i32 1 diff --git a/test/CodeGen/AMDGPU/subreg-intervals.mir b/test/CodeGen/AMDGPU/subreg-intervals.mir index c4e00215159b..c477fe9bc6d3 100644 --- a/test/CodeGen/AMDGPU/subreg-intervals.mir +++ b/test/CodeGen/AMDGPU/subreg-intervals.mir @@ -10,8 +10,8 @@ # CHECK-LABEL: Machine code for function test1: --- | - define void @test0() { ret void } - define void @test1() { ret void } + define amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } ... --- name: test0 diff --git a/test/CodeGen/AMDGPU/subreg_interference.mir b/test/CodeGen/AMDGPU/subreg_interference.mir new file mode 100644 index 000000000000..24d06a576c2a --- /dev/null +++ b/test/CodeGen/AMDGPU/subreg_interference.mir @@ -0,0 +1,24 @@ +# RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s +--- +# We should not detect any interference between v0/v1 here and only allocate +# sgpr0-sgpr3. +# +# CHECK-LABEL: func0 +# CHECK: S_NOP 0, implicit-def %sgpr0 +# CHECK: S_NOP 0, implicit-def %sgpr3 +# CHECK: S_NOP 0, implicit-def %sgpr1 +# CHECK: S_NOP 0, implicit-def %sgpr2 +# CHECK: S_NOP 0, implicit %sgpr0, implicit %sgpr3 +# CHECK: S_NOP 0, implicit %sgpr1, implicit %sgpr2 +name: func0 +body: | + bb.0: + S_NOP 0, implicit-def undef %0.sub0 : sreg_128 + S_NOP 0, implicit-def %0.sub3 + S_NOP 0, implicit-def undef %1.sub1 : sreg_128 + S_NOP 0, implicit-def %1.sub2 + + + S_NOP 0, implicit %0.sub0, implicit %0.sub3 + S_NOP 0, implicit %1.sub1, implicit %1.sub2 +... 
diff --git a/test/CodeGen/AMDGPU/target-cpu.ll b/test/CodeGen/AMDGPU/target-cpu.ll index cf80ff3f4c83..466e89ebee80 100644 --- a/test/CodeGen/AMDGPU/target-cpu.ll +++ b/test/CodeGen/AMDGPU/target-cpu.ll @@ -14,7 +14,7 @@ declare void @llvm.amdgcn.s.dcache.wb() #0 ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]] ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @target_none() #0 { +define amdgpu_kernel void @target_none() #0 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -30,7 +30,7 @@ define void @target_none() #0 { ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]] ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @target_tahiti() #1 { +define amdgpu_kernel void @target_tahiti() #1 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -46,7 +46,7 @@ define void @target_tahiti() #1 { ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 ; CHECK: s_dcache_inv_vol -define void @target_bonaire() #3 { +define amdgpu_kernel void @target_bonaire() #3 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -63,7 +63,7 @@ define void @target_bonaire() #3 { ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400 ; CHECK: flat_store_dword ; CHECK: s_dcache_wb{{$}} -define void @target_fiji() #4 { +define amdgpu_kernel void @target_fiji() #4 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -79,7 +79,7 @@ define void @target_fiji() #4 { ; CHECK-LABEL: {{^}}promote_alloca_enabled: ; CHECK: ds_read_b32 ; CHECK: ; LDSByteSize: 5120 -define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { +define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -93,7 +93,7 @@ entry: ; CHECK: SCRATCH_RSRC_DWORD0 ; CHECK: SCRATCH_RSRC_DWORD1 ; CHECK: ScratchSize: 24 -define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll index 1555cfe39b1e..77ad895d0e86 100644 --- 
a/test/CodeGen/AMDGPU/trap.ll +++ b/test/CodeGen/AMDGPU/trap.ll @@ -1,13 +1,81 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-TRAP %s -; GCN: warning: :0:0: in function trap void (): trap handler not supported +; RUN: llc -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s +; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s +; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s + +; enable trap handler feature +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s + +; disable trap handler feature +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s + +; RUN: llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s declare void @llvm.trap() #0 +declare void @llvm.debugtrap() #0 + +; MESA-TRAP: .section .AMDGPU.config +; MESA-TRAP: .long 47180 +; MESA-TRAP-NEXT: .long 208 + +; NOMESA-TRAP: .section .AMDGPU.config +; NOMESA-TRAP: .long 47180 +; NOMESA-TRAP-NEXT: .long 144 + +; GCN-LABEL: {{^}}hsa_trap: +; HSA-TRAP: enable_trap_handler = 1 +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP: s_trap 2 + +; for llvm.trap in hsa path without ABI, direct generate s_endpgm instruction without any warning information +; NO-HSA-TRAP: enable_trap_handler = 0 +; NO-HSA-TRAP: s_endpgm +; NO-HSA-TRAP: COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 + +; TRAP-BIT: enable_trap_handler = 1 +; NO-TRAP-BIT: enable_trap_handler = 0 +; NO-MESA-TRAP: s_endpgm +define amdgpu_kernel void @hsa_trap() { + call void @llvm.trap() + ret void +} + +; MESA-TRAP: .section .AMDGPU.config +; MESA-TRAP: .long 47180 +; MESA-TRAP-NEXT: .long 208 + +; NOMESA-TRAP: .section .AMDGPU.config +; NOMESA-TRAP: .long 47180 +; NOMESA-TRAP-NEXT: .long 144 + +; GCN-WARNING: warning: :0:0: in function hsa_debugtrap void (): debugtrap handler not supported +; GCN-LABEL: {{^}}hsa_debugtrap: +; HSA-TRAP: enable_trap_handler = 1 +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP: s_trap 3 + +; for llvm.debugtrap in non-hsa path without ABI, generate a warning and a s_endpgm instruction +; NO-HSA-TRAP: enable_trap_handler = 0 +; NO-HSA-TRAP: s_endpgm + +; TRAP-BIT: enable_trap_handler = 1 +; NO-TRAP-BIT: enable_trap_handler = 0 +; NO-MESA-TRAP: s_endpgm +define amdgpu_kernel void @hsa_debugtrap() { + call void @llvm.debugtrap() + ret void +} +; For non-HSA path ; GCN-LABEL: {{^}}trap: -; GCN: s_endpgm -; GCN-NEXT: s_endpgm -define void @trap() { +; TRAP-BIT: enable_trap_handler = 1 +; NO-TRAP-BIT: enable_trap_handler = 0 +; NO-HSA-TRAP: s_endpgm +; NO-MESA-TRAP: s_endpgm 
+define amdgpu_kernel void @trap() { call void @llvm.trap() ret void } diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll index a331475820a0..f90040385f75 100644 --- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %bc = bitcast <2 x i32> %ld to i64 %trunc = trunc i64 %bc to i32 @@ -15,7 +15,7 @@ define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace ; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) { %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in %bc = bitcast <3 x i32> %ld to i96 %trunc = trunc i96 %bc to i32 @@ -26,7 +26,7 @@ define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace ; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %bc = bitcast <4 x i32> %ld to i128 %trunc = trunc i128 %bc to i32 @@ -38,7 +38,7 @@ define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspac ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16: ; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_short [[VAL]] -define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in %bc = bitcast <2 x i16> %ld to i32 %trunc = trunc i32 %bc to i16 @@ -54,7 +54,7 @@ define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace ; SI: buffer_load_dword v[[VAL:[0-9]+]] ; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]] ; CHECK: buffer_store_short [[VAL]] -define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in %bc = bitcast <4 x i16> %ld to i64 %trunc = trunc i64 %bc to i16 @@ -66,7 +66,7 @@ define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace ; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8: ; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in %bc = bitcast <2 x i8> %ld to i16 %trunc = trunc i16 %bc to i8 @@ -77,7 +77,7 @@ define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* ; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8: ; CHECK: 
buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in %bc = bitcast <4 x i8> %ld to i32 %trunc = trunc i32 %bc to i8 @@ -88,7 +88,7 @@ define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1) ; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8: ; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) { %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %bc = bitcast <3 x i8> %ld to i24 %trunc = trunc i24 %bc to i8 diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll index 7a4bced9d436..cb8d36550331 100644 --- a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll +++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI: v_cmp_eq_u32_e32 vcc, 0, [[TMP]]{{$}} ; SI: v_cndmask_b32_e64 ; SI: buffer_store_byte -define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, 0 @@ -25,7 +25,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, 0 @@ -36,7 +36,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, 1 @@ -48,7 +48,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, 1 @@ -60,7 +60,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void 
@sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, -1 @@ -71,7 +71,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, -1 @@ -84,7 +84,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, 0 @@ -96,7 +96,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, 0 @@ -107,7 +107,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, 1 @@ -122,7 +122,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, 1 @@ -137,7 +137,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa ; XSI: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} ; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] ; XSI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 
addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, -1 @@ -148,7 +148,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, -1 @@ -162,7 +162,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr ; SI: v_cmp_ne_u32_e32 vcc, -1, [[LOAD]]{{$}} ; SI-NEXT: v_cndmask_b32_e64 ; SI: {{buffer|flat}}_store_byte -define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr i8, i8 addrspace(1)* %in, i32 %tid.x %load = load i8, i8 addrspace(1)* %in.ptr diff --git a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll index 03b8af0610d7..d67b8f981b28 100644 --- a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll +++ b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: ; GCN: s_endpgm -define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { %val = load double, double addrspace(1)* %in %cvt = fptrunc double %val to half store half %cvt, half addrspace(1)* %out @@ -11,7 +11,7 @@ define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrsp ; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: ; GCN: s_endpgm -define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { %val = load <2 x double>, <2 x double> addrspace(1)* %in %cvt = fptrunc <2 x double> %val to <2 x half> store <2 x half> %cvt, <2 x half> addrspace(1)* %out @@ -20,7 +20,7 @@ define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 ; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: ; GCN: s_endpgm -define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { %val = load <3 x double>, <3 x double> addrspace(1)* %in %cvt = fptrunc <3 x double> %val to <3 x half> store <3 x half> %cvt, <3 x half> addrspace(1)* %out @@ -29,7 +29,7 @@ define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 ; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16: ; GCN: s_endpgm -define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { %val = load <4 x double>, <4 x double> addrspace(1)* %in %cvt = 
fptrunc <4 x double> %val to <4 x half> store <4 x half> %cvt, <4 x half> addrspace(1)* %out @@ -38,7 +38,7 @@ define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 ; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: ; GCN: s_endpgm -define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { %val = load <8 x double>, <8 x double> addrspace(1)* %in %cvt = fptrunc <8 x double> %val to <8 x half> store <8 x half> %cvt, <8 x half> addrspace(1)* %out @@ -47,7 +47,7 @@ define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 ; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: ; GCN: s_endpgm -define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { %val = load <16 x double>, <16 x double> addrspace(1)* %in %cvt = fptrunc <16 x double> %val to <16 x half> store <16 x half> %cvt, <16 x half> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/trunc-store-i1.ll b/test/CodeGen/AMDGPU/trunc-store-i1.ll index da2a5b43dad5..4ea2352f57f3 100644 --- a/test/CodeGen/AMDGPU/trunc-store-i1.ll +++ b/test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -7,7 +7,7 @@ ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] ; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { +define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { %trunc = trunc i32 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void @@ -15,7 +15,7 @@ define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwi ; SI-LABEL: {{^}}global_truncstore_i64_to_i1: ; SI: buffer_store_byte -define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { +define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { %trunc = trunc i64 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void @@ -26,13 +26,13 @@ define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwi ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] ; SI: buffer_store_byte [[VREG]], -define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { +define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { %trunc = trunc i16 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void } ; SI-LABEL: {{^}}global_truncstore_i16_to_i1: -define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { +define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { %add = add i16 %val0, %val1 %trunc = trunc i16 %add to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 diff --git a/test/CodeGen/AMDGPU/trunc-store.ll b/test/CodeGen/AMDGPU/trunc-store.ll index c6727e1e1273..f45de679588f 100644 --- a/test/CodeGen/AMDGPU/trunc-store.ll +++ b/test/CodeGen/AMDGPU/trunc-store.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8: ; SI: buffer_store_dwordx4 -define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> 
addrspace(1)* %out, <16 x i32> %in) { +define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) { %trunc = trunc <16 x i32> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out ret void @@ -11,7 +11,7 @@ define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8: ; SI: buffer_store_dwordx4 -define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) { +define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) { %trunc = trunc <16 x i64> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll index 878ea3f48995..3dbc10d2e9b5 100644 --- a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll +++ b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: {{^}}test: ; CHECK: MEM_RAT_CACHELESS STORE_RAW -define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { +define amdgpu_kernel void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %if, label %done diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll index 2c2ce4c5d351..0c91d52df0c0 100644 --- a/test/CodeGen/AMDGPU/trunc.ll +++ b/test/CodeGen/AMDGPU/trunc.ll @@ -4,7 +4,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone -define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] @@ -28,7 +28,7 @@ define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { ; SI: buffer_store_dword [[VSHL]] ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]] -define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { %b = shl i64 %a, 2 %result = trunc i64 %b to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -46,7 +46,7 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { ; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]] ; GCN: v_mov_b32_e32 ; GCN: v_mov_b32_e32 -define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { %aa = add i64 %a, 234 ; Prevent shrinking store. 
%b = shl i64 %aa, 2 %result = trunc i64 %b to i32 @@ -56,9 +56,8 @@ define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 } ; GCN-LABEL: {{^}}trunc_i32_to_i1: -; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; GCN: v_cmp_eq_u32 -define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { +; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}} +define amdgpu_kernel void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { %a = load i32, i32 addrspace(1)* %ptr, align 4 %trunc = trunc i32 %a to i1 %result = select i1 %trunc, i32 1, i32 0 @@ -67,9 +66,8 @@ define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { } ; GCN-LABEL: {{^}}trunc_i8_to_i1: -; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; GCN: v_cmp_eq_u32 -define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) { +; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}} +define amdgpu_kernel void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) { %a = load i8, i8 addrspace(1)* %ptr, align 4 %trunc = trunc i8 %a to i1 %result = select i1 %trunc, i8 1, i8 0 @@ -78,9 +76,8 @@ define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) { } ; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1: -; GCN: s_and_b32 s{{[0-9]+}}, 1, s{{[0-9]+}} -; GCN: v_cmp_eq_u32 -define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) { +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +define amdgpu_kernel void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) { %trunc = trunc i16 %a to i1 %result = select i1 %trunc, i16 1, i16 0 store i16 %result, i16 addrspace(1)* %out, align 4 @@ -88,9 +85,8 @@ define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) { } ; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1: -; GCN: s_and_b32 s{{[0-9]+}}, 1, s{{[0-9]+}} -; GCN: v_cmp_eq_u32 -define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { %trunc = trunc i32 %a to i1 %result = select i1 %trunc, i32 1, i32 0 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -103,7 +99,7 @@ define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]] ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}} ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] -define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { +define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { %trunc = trunc i64 %x to i1 %sel = select i1 %trunc, i32 63, i32 -12 store i32 %sel, i32 addrspace(1)* %out @@ -116,7 +112,7 @@ define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]] ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll index 76c32afc1f21..7c369a312761 100644 --- a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll +++ b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll @@ -19,7 +19,7 @@ ; 
CHECK: store i8 0, i8 addrspace(1)* ; CHECK-NOT: store i8 0, i8 addrspace(1)* ; CHECK: ret void -define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { entry: %add = add nsw i32 %b, 4 %cmp = icmp sgt i32 %add, %a diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll index 35af7119a300..632ccaa7e612 100644 --- a/test/CodeGen/AMDGPU/uaddo.ll +++ b/test/CodeGen/AMDGPU/uaddo.ll @@ -1,19 +1,16 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s -declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}uaddo_i64_zext: -; SI: add -; SI: addc -; SI: addc +; FUNC-LABEL: {{^}}s_uaddo_i64_zext: +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: v_cmp_lt_u64_e32 vcc ; EG: ADDC_UINT ; EG: ADDC_UINT -define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind +define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 %ext = zext i1 %carry to i64 @@ -22,13 +19,16 @@ define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ret void } +; FIXME: Could do scalar + ; FUNC-LABEL: {{^}}s_uaddo_i32: -; SI: s_add_i32 +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG: ADDC_UINT ; EG: ADD_INT -define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind +define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 %carry = extractvalue { i32, i1 } %uadd, 1 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -37,14 +37,19 @@ define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 } ; FUNC-LABEL: {{^}}v_uaddo_i32: -; SI: v_add_i32 +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG: ADDC_UINT ; EG: ADD_INT -define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind +define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 
addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 %carry = extractvalue { i32, i1 } %uadd, 1 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -52,14 +57,36 @@ define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 ret void } +; FUNC-LABEL: {{^}}v_uaddo_i32_novcc: +; GCN: v_add_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]] + +; EG: ADDC_UINT +; EG: ADD_INT +define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = extractvalue { i32, i1 } %uadd, 1 + store volatile i32 %val, i32 addrspace(1)* %out, align 4 + call void asm sideeffect "", "~{VCC}"() #0 + store volatile i1 %carry, i1 addrspace(1)* %carryout + ret void +} + ; FUNC-LABEL: {{^}}s_uaddo_i64: -; SI: s_add_u32 -; SI: s_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 ; EG: ADDC_UINT ; EG: ADD_INT -define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind +define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 store i64 %val, i64 addrspace(1)* %out, align 8 @@ -68,18 +95,48 @@ define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 } ; FUNC-LABEL: {{^}}v_uaddo_i64: -; SI: v_add_i32 -; SI: v_addc_u32 +; GCN: v_add_i32 +; GCN: v_addc_u32 ; EG: ADDC_UINT ; EG: ADD_INT -define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind +define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr + %b.gep = getelementptr inbounds i64, i64 addrspace(1)* %b.ptr + %a = load i64, i64 addrspace(1)* %a.gep + %b = load i64, i64 addrspace(1)* %b.gep + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 + store i64 %val, i64 addrspace(1)* %out 
+ store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_uaddo_i16: +; VI: v_add_u16_e32 +; VI: v_cmp_lt_u16_e32 +define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr + %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr + %a = load i16, i16 addrspace(1)* %a.gep + %b = load i16, i16 addrspace(1)* %b.gep + %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b) + %val = extractvalue { i16, i1 } %uadd, 0 + %carry = extractvalue { i16, i1 } %uadd, 1 + store i16 %val, i16 addrspace(1)* %out store i1 %carry, i1 addrspace(1)* %carryout ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1 +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 +declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll index da88d2a8e8cb..2874a0cdbc05 100644 --- a/test/CodeGen/AMDGPU/udiv.ll +++ b/test/CodeGen/AMDGPU/udiv.ll @@ -1,22 +1,27 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s + +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}udiv_i32: ; EG-NOT: SETGE_INT ; EG: CF_END -define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + +; SI: v_rcp_iflag_f32_e32 +define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1) * %in - %b = load i32, i32 addrspace(1) * %b_ptr + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr %result = udiv i32 %a, %b store i32 %result, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}s_udiv_i32: - -define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +; SI: v_rcp_iflag_f32_e32 +define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = udiv i32 %a, %b store i32 %result, i32 addrspace(1)* %out ret void @@ -30,8 +35,10 @@ define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; FUNC-LABEL: {{^}}udiv_v2i32: ; EG: CF_END +; SI: v_rcp_iflag_f32_e32 +; SI: v_rcp_iflag_f32_e32 ; SI: s_endpgm -define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -43,7 +50,7 @@ define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ; FUNC-LABEL: {{^}}udiv_v4i32: ; EG: CF_END ; SI: s_endpgm -define 
void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -56,7 +63,7 @@ define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; SI: buffer_load_dword [[VAL:v[0-9]+]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %result = udiv i32 %a, 16 @@ -70,7 +77,7 @@ define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %result = udiv i32 %a, 34259182 @@ -84,7 +91,7 @@ define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %result = udiv i32 %a, 34259183 @@ -96,7 +103,7 @@ define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -110,7 +117,7 @@ define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in %den = load i16, i16 addrspace(1) * %den_ptr @@ -124,7 +131,7 @@ define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1 %num = load i23, i23 addrspace(1) * %in %den = load i23, i23 addrspace(1) * %den_ptr @@ -136,7 +143,7 @@ define void @v_udiv_i23(i32 
addrspace(1)* %out, i23 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}v_udiv_i24: ; SI-NOT: v_rcp_f32 -define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1 %num = load i24, i24 addrspace(1) * %in %den = load i24, i24 addrspace(1) * %den_ptr @@ -152,9 +159,42 @@ define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { ; SI: v_mul_hi_u32 ; SI: v_mul_hi_u32 -define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { +define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %2 = udiv <4 x i32> %1, store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 ret void } + +; FUNC-LABEL: {{^}}test_udiv2: +; SI: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 +define amdgpu_kernel void @test_udiv2(i32 %p) { + %i = udiv i32 %p, 2 + store volatile i32 %i, i32 addrspace(1)* undef + ret void +} + +; FUNC-LABEL: {{^}}test_udiv_3_mulhu: +; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab +; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} +; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { + %i = udiv i32 %p, 3 + store volatile i32 %i, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fdiv_test_denormals +; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) { +bb: + %tmp = load i8, i8 addrspace(1)* null, align 1 + %tmp1 = sext i8 %tmp to i32 + %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef + %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1 + %tmp4 = sext i8 %tmp3 to i32 + %tmp5 = sdiv i32 %tmp1, %tmp4 + %tmp6 = trunc i32 %tmp5 to i8 + store i8 %tmp6, i8 addrspace(1)* null, align 1 + ret void +} diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll index 17f4ebf175d9..9507a49cfc8b 100644 --- a/test/CodeGen/AMDGPU/udivrem.ll +++ b/test/CodeGen/AMDGPU/udivrem.ll @@ -51,7 +51,7 @@ ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { +define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { %result0 = udiv i32 %x, %y store i32 %result0, i32 addrspace(1)* %out0 %result1 = urem i32 %x, %y @@ -158,7 +158,7 @@ define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { %result0 = udiv <2 x i32> %x, %y store <2 x i32> %result0, <2 x i32> addrspace(1)* %out %result1 = urem <2 x i32> %x, %y @@ -340,7 +340,7 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3 ; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { %result0 = udiv <4 x i32> %x, %y store <4 x i32> %result0, <4 x i32> 
addrspace(1)* %out %result1 = urem <4 x i32> %x, %y diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll index 6d145f1dbf09..6f144dcc6fd2 100644 --- a/test/CodeGen/AMDGPU/udivrem24.ll +++ b/test/CodeGen/AMDGPU/udivrem24.ll @@ -12,7 +12,7 @@ ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -31,7 +31,7 @@ define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -50,7 +50,7 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -67,7 +67,7 @@ define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -84,7 +84,7 @@ define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -101,7 +101,7 @@ define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -121,7 +121,7 @@ define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -141,7 +141,7 @@ define void 
@udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -161,7 +161,7 @@ define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -184,7 +184,7 @@ define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -203,7 +203,7 @@ define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -215,7 +215,7 @@ define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}urem24_i32: ; SI-NOT: v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -235,7 +235,7 @@ define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -255,7 +255,7 @@ define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -275,7 +275,7 @@ define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void 
@test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -294,7 +294,7 @@ define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], ; EG: RECIP_IEEE -define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -313,7 +313,7 @@ define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* % ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], ; EG: RECIP_IEEE -define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll index da61a841ff35..bd297920d563 100644 --- a/test/CodeGen/AMDGPU/udivrem64.ll +++ b/test/CodeGen/AMDGPU/udivrem64.ll @@ -70,7 +70,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = udiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -144,7 +144,7 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -159,7 +159,7 @@ define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = udiv i64 %1, %2 @@ -176,7 +176,7 @@ define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = urem i64 %1, %2 @@ -195,7 +195,7 @@ define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;VI-NOT: v_lshrrev_b64 ;GCN: v_mad_f32 ;GCN: s_endpgm -define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = udiv i64 %1, %2 @@ -214,7 +214,7 @@ define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;VI-NOT: v_lshrrev_b64 ;GCN: v_mad_f32 ;GCN: s_endpgm -define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 
41 %result = urem i64 %1, %2 diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index a4e18ebc9120..62943aeefbd8 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %gep, align 8 @@ -19,21 +19,21 @@ define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1) } ; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 -define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { %cast = uitofp i64 %in to double store double %cast, double addrspace(1)* %out, align 8 ret void } ; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 -define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { %cast = uitofp <2 x i64> %in to <2 x double> store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 ret void } ; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 -define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { +define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { %cast = uitofp <4 x i64> %in to <4 x double> store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 ret void @@ -42,7 +42,7 @@ define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i ; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { %cast = uitofp i32 %in to double store double %cast, double addrspace(1)* %out, align 8 ret void @@ -52,7 +52,7 @@ define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { ; SI: v_cvt_f64_u32_e32 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { %cast = uitofp <2 x i32> %in to <2 x double> store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 ret void @@ -64,7 +64,7 @@ define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i ; SI: v_cvt_f64_u32_e32 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { %cast = uitofp <4 x i32> %in to <4 x double> store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 ret void @@ -79,7 +79,7 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; 
SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}} ; SI: s_endpgm -define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to double store double %fp, double addrspace(1)* %out, align 4 @@ -91,7 +91,7 @@ define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { ; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { +define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { %fp = uitofp i1 %in to double store double %fp, double addrspace(1)* %out, align 8 ret void diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index cd816b27fce6..4168326e14c6 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -4,7 +4,7 @@ ; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f16: -define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { %result = uitofp i64 %in to half store half %result, half addrspace(1)* %out ret void @@ -24,7 +24,7 @@ define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]] ; GCN: {{buffer|flat}}_store_short {{.*}}[[VR_F16]] -define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -35,7 +35,7 @@ define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* } ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32: -define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { %result = uitofp i64 %in to float store float %result, float addrspace(1)* %out ret void @@ -54,7 +54,7 @@ define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]] -define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -65,14 +65,14 @@ define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* } ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f32: -define void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ %result = uitofp <2 x i64> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f32: -define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> 
addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -83,14 +83,14 @@ define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i6 } ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f16: -define void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ %result = uitofp <2 x i64> %in to <2 x half> store <2 x half> %result, <2 x half> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f16: -define void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll index 3003226ca1a4..2e9918717c3a 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.ll @@ -6,7 +6,7 @@ ; SI: v_cvt_f32_u32_e32 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { %result = uitofp i32 %in to float store float %result, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { ; SI: v_cvt_f32_u32_e32 {{v[0-9]+}}, {{v[0-9]+$}} ; R600: INT_TO_FLT -define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -32,7 +32,7 @@ define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X -define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 { %result = uitofp <2 x i32> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -49,7 +49,7 @@ define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i3 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %value = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = uitofp <4 x i32> %value to <4 x float> store <4 x float> %result, <4 x 
float> addrspace(1)* %out @@ -66,7 +66,7 @@ define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i3 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -81,7 +81,7 @@ define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrsp ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to float store float %fp, float addrspace(1)* %out @@ -92,7 +92,7 @@ define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 { %fp = uitofp i1 %in to float store float %fp, float addrspace(1)* %out ret void @@ -105,7 +105,7 @@ define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]] ; SI: s_endpgm -define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -126,7 +126,7 @@ define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* ; R600-DAG: SETGT_UINT ; R600-DAG: SETE_INT -define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { entry: %cvt = uitofp i64 %in to float store float %cvt, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll index faab5ca5db73..0c3b0fcaf854 100644 --- a/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -8,7 +8,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @uitofp_i16_to_f16( +define amdgpu_kernel void @uitofp_i16_to_f16( half addrspace(1)* %r, i16 addrspace(1)* %a) { entry: @@ -24,7 +24,7 @@ entry: ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @uitofp_i32_to_f16( +define amdgpu_kernel void @uitofp_i32_to_f16( half addrspace(1)* %r, i32 addrspace(1)* %a) { entry: @@ -38,18 +38,23 @@ entry: ; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16 ; GCN: buffer_load_dword -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; VI: v_cvt_f32_i32_e32 -; VI: v_cvt_f32_i32_e32 -; GCN: 
v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-DAG: v_and_b32_e32 -; GCN-DAG: v_lshlrev_b32_e32 -; GCN-DAG: v_or_b32_e32 -; GCN: buffer_store_dword -; GCN: s_endpgm -define void @uitofp_v2i16_to_v2f16( + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_lshlrev_b32_e32 +; SI: v_or_b32_e32 + +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f32_i32_sdwa +; VI-DAG: v_cvt_f32_i32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa +; VI: v_or_b32_e32 + +; GCN: buffer_store_dword +; GCN: s_endpgm +define amdgpu_kernel void @uitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) { entry: @@ -61,16 +66,23 @@ entry: ; GCN-LABEL: {{^}}uitofp_v2i32_to_v2f16 ; GCN: buffer_load_dwordx2 -; GCN: v_cvt_f32_u32_e32 -; GCN: v_cvt_f32_u32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-DAG: v_and_b32_e32 -; GCN-DAG: v_lshlrev_b32_e32 -; GCN-DAG: v_or_b32_e32 + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_lshlrev_b32_e32 +; SI: v_or_b32_e32 + +; VI-DAG: v_cvt_f32_u32_e32 +; VI-DAG: v_cvt_f32_u32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI: v_or_b32_e32 + ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @uitofp_v2i32_to_v2f16( +define amdgpu_kernel void @uitofp_v2i32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i32> addrspace(1)* %a) { entry: diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll index a2e485d36225..5a579f3575fd 100644 --- a/test/CodeGen/AMDGPU/umed3.ll +++ b/test/CodeGen/AMDGPU/umed3.ll @@ -1,12 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32: ; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -24,8 +25,8 @@ define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a ; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32: ; GCN: v_max_u32 ; GCN: v_min_u32 -define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -44,8 
+45,8 @@ define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32: ; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} -define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -63,8 +64,8 @@ define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32: ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} -define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -82,8 +83,8 @@ define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64: ; GCN: v_cmp_lt_u64 ; GCN: v_cmp_gt_u64 -define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid %a = load i64, i64 addrspace(1)* %gep0 @@ -99,9 +100,10 @@ define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a } ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16: -; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() +; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid %a = load i16, i16 addrspace(1)* %gep0 @@ -171,7 +173,7 @@ define internal i8 @umax8(i8 %x, i8 %y) #2 { ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -183,7 +185,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void 
@s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -195,7 +197,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -207,7 +209,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -219,7 +221,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -231,7 +233,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -243,7 +245,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -255,7 +257,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -267,7 +269,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -279,7 +281,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -291,7 +293,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10: ; GCN: v_med3_u32 v{{[0-9]+}}, 
s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -303,7 +305,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -315,7 +317,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -327,7 +329,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -339,7 +341,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -351,7 +353,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -366,7 +368,7 @@ bb: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { bb: %tmp0 = call i16 @umin16(i16 %x, i16 %y) %tmp1 = call i16 @umax16(i16 %x, i16 %y) @@ -381,7 +383,7 @@ bb: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { bb: %tmp0 = call i8 @umin8(i8 %x, i8 %y) %tmp1 = call i8 @umax8(i8 %x, i8 %y) @@ -393,7 +395,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = 
call i32 @umax(i32 %x, i32 %y) @@ -406,7 +408,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -419,7 +421,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -432,7 +434,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -445,7 +447,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 1, i32 %y) %tmp1 = call i32 @umax(i32 1, i32 %y) @@ -457,7 +459,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 2) %tmp1 = call i32 @umax(i32 %x, i32 2) @@ -469,7 +471,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9 -define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -479,6 +481,35 @@ bb: ret void } +; GCN-LABEL: {{^}}v_test_umed3_i16_pat_0: +; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; FIXME: VI not matching med3 +; VI: v_min_u16 +; VI: v_max_u16 +; VI: v_min_u16 +; VI: v_max_u16 + +; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid + %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3 + %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8 + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %x = load i16, i16 addrspace(1)* %gep0 + %y = load i16, i16 addrspace(1)* %gep1 + %z = load i16, i16 addrspace(1)* %gep2 + + %tmp0 = call i16 @umin16(i16 %x, i16 %y) + %tmp1 = call i16 
@umax16(i16 %x, i16 %y) + %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z) + %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2) + store i16 %tmp3, i16 addrspace(1)* %out.gep + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind readnone alwaysinline } diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll index 0f76a54975e6..68aacd084bf9 100644 --- a/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -8,7 +8,7 @@ ; SI: ds_write_b8 ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 { %v = load i16, i16 addrspace(3)* %p, align 1 store i16 %v, i16 addrspace(3)* %r, align 1 ret void @@ -23,7 +23,7 @@ define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace( ; UNALIGNED: buffer_load_ushort ; UNALIGNED: buffer_store_short ; SI: s_endpgm -define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 { %v = load i16, i16 addrspace(1)* %p, align 1 store i16 %v, i16 addrspace(1)* %r, align 1 ret void @@ -42,7 +42,7 @@ define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace ; SI: ds_write_b8 ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { %v = load i32, i32 addrspace(3)* %p, align 1 store i32 %v, i32 addrspace(3)* %r, align 1 ret void @@ -60,7 +60,7 @@ define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace( ; UNALIGNED: buffer_load_dword ; UNALIGNED: buffer_store_dword -define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(1)* %p, align 1 store i32 %v, i32 addrspace(1)* %r, align 1 ret void @@ -74,7 +74,7 @@ define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace ; UNALIGNED: buffer_load_dword ; UNALIGNED: buffer_store_dword -define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(1)* %p, align 2 store i32 %v, i32 addrspace(1)* %r, align 2 ret void @@ -85,7 +85,7 @@ define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1) ; GCN: ds_read_u16 ; GCN: ds_write_b16 ; GCN: ds_write_b16 -define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { %v = load i32, i32 addrspace(3)* %p, align 2 store i32 %v, i32 addrspace(3)* %r, align 2 ret void @@ -132,7 +132,7 @@ define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* ; SI-NOT: v_lshl ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 
addrspace(3)* %r) #0 { %v = load i64, i64 addrspace(3)* %p, align 1 store i64 %v, i64 addrspace(3)* %r, align 1 ret void @@ -179,7 +179,7 @@ define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace( ; SI-NOT: v_lshl ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 { %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1 store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1 ret void @@ -209,7 +209,7 @@ define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i ; UNALIGNED: buffer_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(1)* %p, align 2 store i64 %v, i64 addrspace(1)* %r, align 2 ret void @@ -239,7 +239,7 @@ define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1) ; UNALIGNED: buffer_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(1)* %p, align 1 store i64 %v, i64 addrspace(1)* %r, align 1 ret void @@ -286,7 +286,7 @@ define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: s_endpgm -define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 ret void @@ -329,7 +329,7 @@ define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i ; UNALIGNED: buffer_load_dwordx4 ; UNALIGNED: buffer_store_dwordx4 -define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1 ret void @@ -337,7 +337,7 @@ define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x ; FUNC-LABEL: {{^}}local_load_i64_align_4: ; GCN: ds_read2_b32 -define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %val = load i64, i64 addrspace(3)* %in, align 4 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -345,7 +345,7 @@ define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrsp ; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9 -define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %ptr = 
getelementptr i64, i64 addrspace(3)* %in, i32 4 %val = load i64, i64 addrspace(3)* %ptr, align 4 store i64 %val, i64 addrspace(1)* %out, align 8 @@ -356,7 +356,7 @@ define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1 ; GCN: s_endpgm -define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* @@ -375,7 +375,7 @@ define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocaptur ; GCN: ds_read_u8 ; GCN: ds_read_u8 ; GCN: store_dwordx2 -define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %val = load i64, i64 addrspace(3)* %in, align 1 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -383,7 +383,7 @@ define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrsp ; FUNC-LABEL: {{^}}local_store_i64_align_4: ; GCN: ds_write2_b32 -define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { +define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { store i64 %val, i64 addrspace(3)* %out, align 4 ret void } @@ -391,7 +391,7 @@ define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { ; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9 ; GCN: s_endpgm -define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4 store i64 0, i64 addrspace(3)* %ptr, align 4 ret void @@ -401,7 +401,7 @@ define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; GCN: s_endpgm -define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* @@ -418,7 +418,7 @@ define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) # ; UNALIGNED: s_load_dword ; SI: buffer_store_dword -define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(2)* %p, align 1 store i32 %v, i32 addrspace(1)* %r, align 4 ret void @@ -430,7 +430,7 @@ define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* ; UNALIGNED: s_load_dword ; UNALIGNED: buffer_store_dword -define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 
addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(2)* %p, align 2 store i32 %v, i32 addrspace(1)* %r, align 4 ret void @@ -444,7 +444,7 @@ define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r ; UNALIGNED: s_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(2)* %p, align 2 store i64 %v, i64 addrspace(1)* %r, align 4 ret void @@ -453,7 +453,7 @@ define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r ; SI-LABEL: {{^}}constant_align4_load_i64: ; SI: s_load_dwordx2 ; SI: buffer_store_dwordx2 -define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(2)* %p, align 4 store i64 %v, i64 addrspace(1)* %r, align 4 ret void @@ -462,7 +462,7 @@ define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r ; SI-LABEL: {{^}}constant_align4_load_v4i32: ; SI: s_load_dwordx4 ; SI: buffer_store_dwordx4 -define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4 ret void @@ -482,7 +482,7 @@ define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> ad ; UNALIGNED: buffer_load_dwordx2 ; SI: buffer_store_dwordx2 -define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 { %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1 store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4 ret void @@ -512,7 +512,7 @@ define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> ; UNALIGNED: buffer_load_dwordx4 ; SI: buffer_store_dwordx4 -define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4 ret void @@ -521,7 +521,7 @@ define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> ; SI-LABEL: {{^}}constant_align4_load_i8: ; SI: buffer_load_ubyte ; SI: buffer_store_byte -define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { %v = load i8, i8 addrspace(2)* %p, align 4 store i8 %v, i8 addrspace(1)* %r, align 4 ret void @@ -530,7 +530,7 @@ define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) # ; SI-LABEL: {{^}}constant_align2_load_i8: ; SI: buffer_load_ubyte ; SI: buffer_store_byte -define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { %v = 
load i8, i8 addrspace(2)* %p, align 2 store i8 %v, i8 addrspace(1)* %r, align 2 ret void @@ -541,7 +541,7 @@ define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) # ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1 %v0 = load i32, i32 addrspace(2)* %p, align 4 %v1 = load i32, i32 addrspace(2)* %gep0, align 4 @@ -571,7 +571,7 @@ define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspac ; SI: ds_read_u8 ; SI: ScratchSize: 0{{$}} -define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 { %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1 store <16 x i8> %ld, <16 x i8> addrspace(1)* %out ret void @@ -596,7 +596,7 @@ define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> add ; SI: ds_write_b8 ; SI: ScratchSize: 0{{$}} -define void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 { store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1 ret void } diff --git a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 4902e9a3cafb..3e80fcf85b52 100644 --- a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}func: -define void @func() #0 { +define amdgpu_kernel void @func() #0 { B0: br i1 undef, label %B1, label %B2 @@ -35,7 +35,8 @@ bb: %tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4 %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0 %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1 - %tmp4 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp3, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp3.cast = bitcast <4 x i32> %tmp3 to <4 x float> + %tmp4 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp3.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp5 = extractelement <4 x float> %tmp4, i32 0 %tmp6 = fmul float %tmp5, undef %tmp7 = fadd float %tmp6, %tmp6 @@ -71,7 +72,7 @@ bb11: ; preds = %bb9 ; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}} -define void @partially_undef_copy() #0 { +define amdgpu_kernel void @partially_undef_copy() #0 { %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"() %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"() @@ -83,8 +84,7 @@ define void @partially_undef_copy() #0 { ret void } -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare float @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 attributes #0 = { nounwind } -attributes 
#1 = { nounwind readnone } +attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll index d96ee6d21ce8..60ab7631a101 100644 --- a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll +++ b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll @@ -5,7 +5,7 @@ ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable. ; COMMON-LABEL: {{^}}branch_true: -define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 true, label %for.end, label %for.body.lr.ph @@ -42,7 +42,7 @@ for.end: ; preds = %for.body, %entry ; SI: s_cbranch_vccnz ; SI: s_cbranch_scc1 ; SI: s_endpgm -define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 false, label %for.end, label %for.body.lr.ph @@ -79,7 +79,7 @@ for.end: ; preds = %for.body, %entry ; SI: s_cbranch_scc1 ; SI: s_cbranch_scc1 ; SI: s_endpgm -define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 undef, label %for.end, label %for.body.lr.ph diff --git a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll index 93a2c6998be4..eb6007f21c10 100644 --- a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll +++ b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll @@ -14,6 +14,7 @@ main_body: if: %u = fadd float %v, %v + call void asm sideeffect "", ""() #0 ; Prevent ifconversion br label %else else: diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll index 154ac361e797..a9d45d71fa2e 100644 --- a/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}uniform_if_scc: ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 @@ -12,7 +12,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %else @@ -40,7 +40,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) { entry: %cmp0 = fcmp oeq float %cond, 0.0 br i1 %cmp0, 
label %if, label %else @@ -68,7 +68,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %else, label %if @@ -96,7 +96,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) { entry: %cmp0 = fcmp oeq float %cond, 0.0 br i1 %cmp0, label %else, label %if @@ -123,7 +123,7 @@ done: ; GCN: buffer_store_dword ; GCN: [[ENDIF_LABEL]]: ; GCN: s_endpgm -define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { +define amdgpu_kernel void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { entry: %a.0 = fadd float %a, 10.0 %cond = bitcast float %a.0 to i32 @@ -148,7 +148,7 @@ endif: ; GCN: buffer_store_dword ; GCN: [[ENDIF_LABEL]]: ; GCN: s_endpgm -define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { +define amdgpu_kernel void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { entry: %a.0 = fadd float %a, 10.0 %cond = bitcast float %a.0 to i32 @@ -166,7 +166,7 @@ endif: ; GCN-LABEL: {{^}}uniform_if_else_ret: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] +; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; GCN: buffer_store_dword [[TWO]] @@ -176,7 +176,7 @@ endif: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] ; GCN: s_endpgm -define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) { +define amdgpu_kernel void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) { entry: %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.else @@ -209,7 +209,7 @@ if.end: ; preds = %if.else, %if.then ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 ; GCN: buffer_store_dword [[THREE]] ; GCN: s_endpgm -define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) { +define amdgpu_kernel void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) { entry: %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.else @@ -233,7 +233,7 @@ if.end: ; preds = %if.else, %if.then ; GCN: buffer_store_dword ; GCN: [[LABEL]]: ; GCN: s_endpgm -define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 %1 = sext i1 %0 to i32 @@ -252,11 +252,13 @@ ENDIF: ; preds = %IF, %main_body ; GCN: s_cmp_lt_i32 [[COND]], 1 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] ; GCN: v_cmp_gt_i32_e64 vcc, [[COND]], 0{{$}} -; GCN: s_cbranch_vccnz [[EXIT]] -; GCN: buffer_store +; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]] ; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm -define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { +; GCN: {{^}}[[BODY]]: +; GCN: buffer_store +; GCN: s_endpgm +define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp sgt i32 %cond0, 0 @@ -282,7 +284,7 @@ bb9: ; 
preds = %bb8, %bb4 ; SI: s_cmp_lg_u32 [[I]], 0 ; SI: s_cbranch_scc1 [[LOOP_LABEL]] ; SI: s_endpgm -define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @uniform_loop(i32 addrspace(1)* %out, i32 %a) { entry: br label %loop @@ -302,12 +304,13 @@ done: ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] -; GCN: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]] ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[ENDIF_LABEL]] +; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]] +; GCN: s_endpgm +; GCN: {{^}}[[IF_UNIFORM_LABEL]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] -define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %d_cmp = icmp ult i32 %tid, 16 @@ -328,15 +331,14 @@ endif: ; GCN-LABEL: {{^}}divergent_inside_uniform: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] +; GCN: [[IF_LABEL]]: ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] -; GCN: [[ENDIF_LABEL]]: -; GCN: s_endpgm -define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { entry: %u_cmp = icmp eq i32 %cond, 0 br i1 %u_cmp, label %if, label %endif @@ -363,12 +365,12 @@ endif: ; GCN: buffer_store_dword [[ONE]] ; GCN: s_or_b64 exec, exec, [[MASK]] ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]] +; GCN: s_cbranch_scc0 [[IF_UNIFORM:[A-Z0-9_]+]] +; GCN: s_endpgm +; GCN: [[IF_UNIFORM]]: ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; GCN: buffer_store_dword [[TWO]] -; GCN: [[EXIT]]: -; GCN: s_endpgm -define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %d_cmp = icmp eq i32 %tid, 0 @@ -408,7 +410,7 @@ exit: ; GCN: BB[[FNNUM]]_3: ; GCN: s_endpgm -define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp1 = icmp sgt i32 %cond, 0 @@ -443,7 +445,7 @@ bb9: ; preds = %bb8, %bb4 ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -475,7 +477,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp ne i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -503,7 +505,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: 
v_mov_b32_e32 [[V_VAL]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp sgt i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -522,7 +524,7 @@ done: ; GCN-LABEL: {{^}}move_to_valu_i64_eq: ; GCN: v_cmp_eq_u64_e32 -define void @move_to_valu_i64_eq(i32 addrspace(1)* %out) { +define amdgpu_kernel void @move_to_valu_i64_eq(i32 addrspace(1)* %out) { %cond = load volatile i64, i64 addrspace(3)* undef %cmp0 = icmp eq i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -541,7 +543,7 @@ done: ; GCN-LABEL: {{^}}move_to_valu_i64_ne: ; GCN: v_cmp_ne_u64_e32 -define void @move_to_valu_i64_ne(i32 addrspace(1)* %out) { +define amdgpu_kernel void @move_to_valu_i64_ne(i32 addrspace(1)* %out) { %cond = load volatile i64, i64 addrspace(3)* undef %cmp0 = icmp ne i64 %cond, 0 br i1 %cmp0, label %if, label %else diff --git a/test/CodeGen/AMDGPU/uniform-crash.ll b/test/CodeGen/AMDGPU/uniform-crash.ll index cfbb2af58677..028199ef9de7 100644 --- a/test/CodeGen/AMDGPU/uniform-crash.ll +++ b/test/CodeGen/AMDGPU/uniform-crash.ll @@ -6,7 +6,7 @@ ; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]] ; GCN: [[LABEL]]: ; GCN-NEXT: s_endpgm -define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 %1 = sext i1 %0 to i32 @@ -25,7 +25,7 @@ ENDIF: ; preds = %IF, %main_body ; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]: ; GCN: s_cbranch_scc1 [[LOOP]] ; GCN: {{^}}[[BB0]]: -define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) { +define amdgpu_kernel void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) { bb: %cnd = trunc i32 %arg to i1 br i1 %cnd, label %bb2, label %bb5 diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll index 2c3a09818860..8a08f9d8bb0d 100644 --- a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -7,11 +7,11 @@ ; CHECK: s_and_saveexec_b64 ; CHECK-NEXT: s_xor_b64 ; CHECK-NEXT: ; mask branch - +; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}} ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader ; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: -; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]] +; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]] ; CHECK: s_endpgm define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) { @@ -38,7 +38,7 @@ out: ; CHECK-NEXT: s_xor_b64 ; CHECK-NEXT: ; mask branch ; CHECK-NEXT: s_cbranch_execz -define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) { main_body: %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %cc = icmp eq i32 %tid, 0 diff --git a/test/CodeGen/AMDGPU/unify-metadata.ll b/test/CodeGen/AMDGPU/unify-metadata.ll index 9549b08ffee1..d96583e71f13 100644 --- a/test/CodeGen/AMDGPU/unify-metadata.ll +++ b/test/CodeGen/AMDGPU/unify-metadata.ll @@ -14,10 +14,6 @@ ; ALL-DAG: ![[USED_EXT_1]] = !{!"cl_khr_fp16"} ; ALL-DAG: ![[USED_EXT_2]] = !{!"cl_doubles"} -define void @test() { - ret void -} - !opencl.ocl.version = !{!1, !0, !0, !0} !llvm.ident = !{!2, !2, !2, !2, !6} !opencl.used.extensions = !{!3, !3, !4, !5} diff --git a/test/CodeGen/AMDGPU/unigine-liveness-crash.ll b/test/CodeGen/AMDGPU/unigine-liveness-crash.ll index 732790ceb335..853131baed5e 100644 --- 
a/test/CodeGen/AMDGPU/unigine-liveness-crash.ll +++ b/test/CodeGen/AMDGPU/unigine-liveness-crash.ll @@ -1,5 +1,4 @@ -; RUN: llc -march=amdgcn < %s | FileCheck %s -; REQUIRES: asserts +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s ; ; This test used to crash with the following assertion: ; llc: include/llvm/ADT/IntervalMap.h:632: unsigned int llvm::IntervalMapImpl::LeafNode >::insertFrom(unsigned int &, unsigned int, KeyT, KeyT, ValT) [KeyT = llvm::SlotIndex, ValT = llvm::LiveInterval *, N = 8, Traits = llvm::IntervalMapInfo]: Assertion `(i == Size || Traits::stopLess(b, start(i))) && "Overlapping insert"' failed. @@ -10,31 +9,33 @@ ; ; Check for a valid output. ; CHECK: image_sample_c - -target triple = "amdgcn--" - -@ddxy_lds = external addrspace(3) global [64 x i32] - define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 { main_body: - %tmp = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg6, <2 x i32> %arg8) - %tmp23 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %i.i = extractelement <2 x i32> %arg8, i32 0 + %j.i = extractelement <2 x i32> %arg8, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 4, i32 %arg6) #2 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 4, i32 %arg6) #2 + %tmp23 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp24 = extractelement <4 x float> %tmp23, i32 3 %tmp25 = fmul float %tmp24, undef - %tmp26 = fmul float undef, %tmp + %tmp26 = fmul float undef, %p2.i %tmp27 = fadd float %tmp26, undef %tmp28 = bitcast float %tmp27 to i32 %tmp29 = insertelement <4 x i32> undef, i32 %tmp28, i32 0 %tmp30 = insertelement <4 x i32> %tmp29, i32 0, i32 1 %tmp31 = insertelement <4 x i32> %tmp30, i32 undef, i32 2 - %tmp32 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp31, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp31.cast = bitcast <4 x i32> %tmp31 to <4 x float> + %tmp32 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp31.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp33 = extractelement <4 x float> %tmp32, i32 0 %tmp34 = fadd float undef, %tmp33 %tmp35 = fadd float %tmp34, undef %tmp36 = fadd float %tmp35, undef %tmp37 = fadd float %tmp36, undef %tmp38 = fadd float %tmp37, undef - %tmp39 = call <4 x float> 
@llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp39 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp40 = extractelement <4 x float> %tmp39, i32 0 %tmp41 = extractelement <4 x float> %tmp39, i32 1 %tmp42 = extractelement <4 x float> %tmp39, i32 2 @@ -51,7 +52,8 @@ main_body: %tmp53 = insertelement <4 x i32> undef, i32 %tmp50, i32 0 %tmp54 = insertelement <4 x i32> %tmp53, i32 %tmp51, i32 1 %tmp55 = insertelement <4 x i32> %tmp54, i32 %tmp52, i32 2 - %tmp56 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp55, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp55.cast = bitcast <4 x i32> %tmp55 to <4 x float> + %tmp56 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp55.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp57 = extractelement <4 x float> %tmp56, i32 0 %tmp58 = fadd float %tmp38, %tmp57 %tmp59 = fadd float undef, %tmp46 @@ -60,7 +62,8 @@ main_body: %tmp62 = bitcast float %tmp60 to i32 %tmp63 = insertelement <4 x i32> undef, i32 %tmp61, i32 1 %tmp64 = insertelement <4 x i32> %tmp63, i32 %tmp62, i32 2 - %tmp65 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp64, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp64.cast = bitcast <4 x i32> %tmp64 to <4 x float> + %tmp65 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp64.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp66 = extractelement <4 x float> %tmp65, i32 0 %tmp67 = fadd float %tmp58, %tmp66 %tmp68 = fmul float %tmp67, 1.250000e-01 @@ -76,8 +79,9 @@ IF26: ; preds = %main_body ENDIF25: ; preds = %IF29, %main_body %.4 = phi float [ %tmp84, %IF29 ], [ %tmp68, %main_body ] %tmp73 = fadd float %.4, undef - %tmp74 = call float @llvm.AMDGPU.clamp.(float %tmp73, float 0.000000e+00, float 1.000000e+00) - %tmp75 = fmul float undef, %tmp74 + %max.0.i = call float @llvm.maxnum.f32(float %tmp73, float 0.000000e+00) + %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00) + %tmp75 = fmul float undef, %clamp.i %tmp76 = fmul float %tmp75, undef %tmp77 = fadd float %tmp76, undef %tmp78 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp77, 11 @@ -99,17 +103,22 @@ IF29: ; preds = %LOOP ENDIF28: ; preds = %LOOP %tmp85 = insertelement <4 x i32> %tmp72, i32 undef, i32 1 %tmp86 = insertelement <4 x i32> %tmp85, i32 undef, i32 2 - %tmp87 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp86, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp86.cast = bitcast <4 x i32> %tmp86 to <4 x float> + %tmp87 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp86.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp88 = extractelement <4 x float> %tmp87, i32 0 %tmp89 = fadd float undef, %tmp88 br label %LOOP } -declare float @llvm.AMDGPU.clamp.(float, float, float) #1 -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 -declare <4 x float> 
@llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" } +attributes #0 = { nounwind "InitialPSInputAddr"="36983" "target-cpu"="tonga" } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind } diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll index 941f4c601e34..25a700a943d2 100644 --- a/test/CodeGen/AMDGPU/unknown-processor.ll +++ b/test/CodeGen/AMDGPU/unknown-processor.ll @@ -13,7 +13,7 @@ ; GCN: ScratchSize: 8{{$}} ; R600: MOV -define void @foo() { +define amdgpu_kernel void @foo() { %alloca = alloca i32, align 4 store volatile i32 0, i32* %alloca ret void diff --git a/test/CodeGen/AMDGPU/unroll.ll b/test/CodeGen/AMDGPU/unroll.ll index 411a15a4b839..2ce4de90a02d 100644 --- a/test/CodeGen/AMDGPU/unroll.ll +++ b/test/CodeGen/AMDGPU/unroll.ll @@ -6,10 +6,10 @@ ; private memory. We want to make sure these kinds of loops are always ; unrolled, because private memory is slow. 
-; CHECK-LABEL: @test +; CHECK-LABEL: @private_memory ; CHECK-NOT: alloca ; CHECK: store i32 5, i32 addrspace(1)* %out -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out) { entry: %0 = alloca [32 x i32] br label %loop.header @@ -34,3 +34,67 @@ exit: store i32 %3, i32 addrspace(1)* %out ret void } + +; Check that loop is unrolled for local memory references + +; CHECK-LABEL: @local_memory +; CHECK: getelementptr i32, i32 addrspace(1)* %out, i32 128 +; CHECK-NEXT: store +; CHECK-NEXT: ret +define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) { +entry: + br label %loop.header + +loop.header: + %counter = phi i32 [0, %entry], [%inc, %loop.inc] + br label %loop.body + +loop.body: + %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter + %val = load i32, i32 addrspace(3)* %ptr_lds + %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter + store i32 %val, i32 addrspace(1)* %ptr_out + br label %loop.inc + +loop.inc: + %inc = add i32 %counter, 1 + %cond = icmp sge i32 %counter, 128 + br i1 %cond, label %exit, label %loop.header + +exit: + ret void +} + +; Check that a loop with if inside completely unrolled to eliminate phi and if + +; CHECK-LABEL: @unroll_for_if +; CHECK: entry: +; CHECK-NEXT: getelementptr +; CHECK-NEXT: store +; CHECK-NEXT: getelementptr +; CHECK-NEXT: store +; CHECK-NOT: br +define amdgpu_kernel void @unroll_for_if(i32* %a) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc + %i1 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %and = and i32 %i1, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %0 = sext i32 %i1 to i64 + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %0 + store i32 0, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw nsw i32 %i1, 1 + %cmp = icmp ult i32 %inc, 48 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.cond + ret void +} diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll index d120111a71fb..68e91e8c9c6b 100644 --- a/test/CodeGen/AMDGPU/unsupported-cc.ll +++ b/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -6,7 +6,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) -define void @slt(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @slt(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp slt i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -18,7 +18,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) -define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @ult_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp ult i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -31,7 +31,7 @@ entry: ; CHECK-NEXT: 1084227584(5.000000e+00) ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 ; CHECK-NEXT: LSHR * -define void @ult_float(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ult_float(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ult float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -43,7 +43,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE {{\*? 
*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ult_float_native(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ult_float_native(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ult float %in, 5.0 %1 = select i1 %0, float 0.0, float 1.0 @@ -55,7 +55,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @olt(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @olt(float addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -67,7 +67,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) -define void @sle(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sle(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sle i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -79,7 +79,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) -define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @ule_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp ule i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -92,7 +92,7 @@ entry: ; CHECK-NEXT: 1084227584(5.000000e+00) ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 ; CHECK-NEXT: LSHR * -define void @ule_float(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ule_float(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ule float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -104,7 +104,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ule_float_native(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ule_float_native(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ule float %in, 5.0 %1 = select i1 %0, float 0.0, float 1.0 @@ -116,7 +116,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE {{\*? 
*}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT:1084227584(5.000000e+00) -define void @ole(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ole(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 diff --git a/test/CodeGen/AMDGPU/urecip.ll b/test/CodeGen/AMDGPU/urecip.ll deleted file mode 100644 index d58d2dc2d963..000000000000 --- a/test/CodeGen/AMDGPU/urecip.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK: v_rcp_iflag_f32_e32 - -define void @test(i32 %p, i32 %q) { - %i = udiv i32 %p, %q - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll index 9e2cfa34e0b9..fd7f8fa2efab 100644 --- a/test/CodeGen/AMDGPU/urem.ll +++ b/test/CodeGen/AMDGPU/urem.ll @@ -9,7 +9,7 @@ ; FUNC-LABEL: {{^}}test_urem_i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -26,7 +26,7 @@ define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_sub_i32 ; SI: buffer_store_dword ; SI: s_endpgm -define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = urem i32 %num, 7 store i32 %result, i32 addrspace(1)* %out @@ -36,7 +36,7 @@ define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}test_urem_v2i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -48,7 +48,7 @@ define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1 ; FUNC-LABEL: {{^}}test_urem_v4i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -60,7 +60,7 @@ define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1 ; FUNC-LABEL: {{^}}test_urem_i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -72,7 +72,7 @@ define void @test_urem_i64(i64 addrspace(1)* 
%out, i64 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}test_urem_v2i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -84,7 +84,7 @@ define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1 ; FUNC-LABEL: {{^}}test_urem_v4i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index 82bdc261b112..f8e6b7edfe35 100644 --- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -11,7 +11,7 @@ declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #1 ; GCN: s_load_dword [[SGPR:s[0-9]+]], ; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { %dbl = fadd float %a, %a store float %dbl, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { ; GCN: s_load_dword [[SGPR:s[0-9]+]], ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -58,7 +58,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] -define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { %va0 = load volatile float, float addrspace(1)* %in %va1 = load volatile float, float addrspace(1)* %in %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1 @@ -76,7 +76,7 @@ define void 
@test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -90,7 +90,7 @@ define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, floa ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -100,7 +100,7 @@ define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, floa ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -110,7 +110,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, fl ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -121,7 +121,7 @@ define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, fl ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 { %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1 store float %val, float addrspace(1)* %out, align 4 ret void @@ -132,7 +132,7 @@ define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, fl ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -143,7 +143,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float 
addrspace(1)* %out, f ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT0]] -define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 store float %fma, float addrspace(1)* %out ret void @@ -158,7 +158,7 @@ define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, f ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1 store volatile float %fma0, float addrspace(1)* %out @@ -171,7 +171,7 @@ define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 store float %fma, float addrspace(1)* %out ret void @@ -186,7 +186,7 @@ define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, f ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1 store volatile float %fma0, float addrspace(1)* %out @@ -199,7 +199,7 @@ define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 store float %fma, float addrspace(1)* %out ret void @@ -214,7 +214,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, f ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1 store volatile float %fma0, float addrspace(1)* %out @@ -234,7 +234,7 
@@ define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] -define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1 store volatile float %fma0, float addrspace(1)* %out @@ -259,7 +259,7 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { ; GCN: buffer_store_dwordx2 [[RESULT0]] ; GCN: buffer_store_dwordx2 [[RESULT1]] -define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1 %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1 store volatile double %fma0, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll index 3c9b1622a076..d1f454f0bc65 100644 --- a/test/CodeGen/AMDGPU/usubo.ll +++ b/test/CodeGen/AMDGPU/usubo.ll @@ -1,16 +1,16 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s -declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}usubo_i64_zext: +; FUNC-LABEL: {{^}}s_usubo_i64_zext: +; GCN: s_sub_u32 +; GCN: s_subb_u32 +; GCN: v_cmp_gt_u64_e32 vcc ; EG: SUBB_UINT ; EG: ADDC_UINT -define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind +define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 %ext = zext i1 %carry to i64 @@ -19,13 +19,16 @@ define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ret void } +; FIXME: Could do scalar + ; FUNC-LABEL: {{^}}s_usubo_i32: -; SI: s_sub_i32 +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind +define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 %carry = extractvalue { i32, i1 } %usub, 1 store i32 %val, i32 addrspace(1)* %out, 
align 4 @@ -34,14 +37,19 @@ define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 } ; FUNC-LABEL: {{^}}v_usubo_i32: -; SI: v_subrev_i32_e32 +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind +define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 %carry = extractvalue { i32, i1 } %usub, 1 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -49,16 +57,38 @@ define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 ret void } +; FUNC-LABEL: {{^}}v_usubo_i32_novcc: +; GCN: v_sub_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]] + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 + %uadd = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = extractvalue { i32, i1 } %uadd, 1 + store volatile i32 %val, i32 addrspace(1)* %out, align 4 + call void asm sideeffect "", "~{VCC}"() #0 + store volatile i1 %carry, i1 addrspace(1)* %carryout + ret void +} + ; FUNC-LABEL: {{^}}s_usubo_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 +; GCN: s_sub_u32 +; GCN: s_subb_u32 ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT ; EG: SUB_INT -define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind +define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 store i64 %val, i64 addrspace(1)* %out, align 8 @@ -67,20 +97,50 @@ define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 } ; FUNC-LABEL: {{^}}v_usubo_i64: -; SI: v_sub_i32 -; SI: v_subb_u32 +; GCN: v_sub_i32 +; GCN: v_subb_u32 ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT ; EG: SUB_INT -define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 
addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind +define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr + %b.gep = getelementptr inbounds i64, i64 addrspace(1)* %b.ptr + %a = load i64, i64 addrspace(1)* %a.gep + %b = load i64, i64 addrspace(1)* %b.gep + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 store i64 %val, i64 addrspace(1)* %out, align 8 store i1 %carry, i1 addrspace(1)* %carryout ret void } + +; FUNC-LABEL: {{^}}v_usubo_i16: +; VI: v_subrev_u16_e32 +; VI: v_cmp_gt_u16_e32 +define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr + %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr + %a = load i16, i16 addrspace(1)* %a.gep + %b = load i16, i16 addrspace(1)* %b.gep + %usub = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %a, i16 %b) + %val = extractvalue { i16, i1 } %usub, 0 + %carry = extractvalue { i16, i1 } %usub, 1 + store i16 %val, i16 addrspace(1)* %out + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1 +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 +declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll index a48e7acd4cf3..b7d766aa395e 100644 --- a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll +++ b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll @@ -1,14 +1,14 @@ ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s ; CHECK-LABEL: {{^}}kernel_arg_i64: -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } ; i64 arg works, v1i64 arg does not. 
; CHECK-LABEL: {{^}}kernel_arg_v1i64: -define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 ret void } diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll index 1cd49feb0d88..d4a68a418ee4 100644 --- a/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/test/CodeGen/AMDGPU/v_cndmask.ll @@ -4,12 +4,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN-LABEL: {{^}}v_cnd_nan_nosgpr: -; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0 -; GCN: v_cndmask_b32_e32 v{{[0-9]}}, -1, v{{[0-9]+}}, vcc +; GCN: v_cmp_eq_u32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]}}, -1, v{{[0-9]+}}, [[COND]] ; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff ; GCN: s_endpgm -define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { +define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx %f = load float, float addrspace(1)* %f.gep @@ -30,7 +30,7 @@ define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace( ; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff ; GCN: s_endpgm -define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { +define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f store float %select, float addrspace(1)* %out @@ -47,7 +47,7 @@ define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -62,7 +62,7 @@ define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float % ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -77,7 +77,7 @@ define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float % ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float 
addrspace(1)* %out, i64 %tid.ext @@ -92,7 +92,7 @@ define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float % ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -105,9 +105,9 @@ define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float % ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GCN-DAG: s_load_dword [[X:s[0-9]+]] ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] -; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 -; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[Z]], vcc -define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { +; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 +; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]] +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext @@ -122,9 +122,9 @@ define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float % ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN: v_cmp_nlg_f32_e64 vcc, [[X]], 0 -; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc -define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { +; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 +; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]] +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext @@ -142,7 +142,7 @@ define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float % ; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -159,7 +159,7 @@ define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float a ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc -define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float 
addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -178,7 +178,7 @@ define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float a ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc -define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -203,7 +203,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrs ; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s -define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext @@ -226,7 +226,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrs ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -249,7 +249,7 @@ define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -275,7 +275,7 @@ define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) 
#0 { +define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -298,7 +298,7 @@ define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s ; GCN: store_byte -define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -321,7 +321,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspa ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -343,7 +343,7 @@ define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, flo ; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -364,7 +364,7 @@ define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float ; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc -define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -386,7 +386,7 @@ define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 ; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]] ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc -define void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float 
addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext diff --git a/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll b/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll index 9246ce38dbed..2cda52a8438a 100644 --- a/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll +++ b/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.cvt.pk.u8.f32(float, i32, i32) #0 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_0: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 0, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_1: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_2: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_3: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -40,7 +40,7 @@ define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) { %result0 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0 %result1 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %result0) #0 %result2 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %result1) #0 @@ -51,7 +51,7 @@ define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %re ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 
%reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 %idx, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll index 9a2dc743d6c9..2b96f7d50076 100644 --- a/test/CodeGen/AMDGPU/v_mac.ll +++ b/test/CodeGen/AMDGPU/v_mac.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s ; GCN-LABEL: {{^}}mac_vvv: ; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} @@ -7,7 +8,7 @@ ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 ; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]] ; GCN: buffer_store_dword [[C]] -define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -25,7 +26,7 @@ entry: ; GCN-LABEL: {{^}}mad_inline_sgpr_inline: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5 -define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 { entry: %tmp0 = fmul float 0.5, %in %tmp1 = fadd float %tmp0, 0.5 @@ -36,7 +37,7 @@ entry: ; GCN-LABEL: {{^}}mad_vvs: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 { +define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -51,7 +52,7 @@ entry: ; GCN-LABEL: {{^}}mac_ssv: ; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 { +define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 { entry: %c = load float, float addrspace(1)* %in @@ -64,7 +65,7 @@ entry: ; GCN-LABEL: {{^}}mac_mad_same_add: ; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} -define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -95,7 +96,7 @@ entry: ; GCN-LABEL: {{^}}mad_neg_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, 
float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -112,10 +113,10 @@ entry: ret void } -; GCN-LABEL: {{^}}unsafe_mad_sub0_src0: +; GCN-LABEL: {{^}}nsz_mad_sub0_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @unsafe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -135,7 +136,7 @@ entry: ; GCN-LABEL: {{^}}safe_mad_sub0_src0: ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, ; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]] -define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -155,7 +156,7 @@ entry: ; GCN-LABEL: {{^}}mad_neg_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -172,10 +173,10 @@ entry: ret void } -; GCN-LABEL: {{^}}unsafe_mad_sub0_src1: +; GCN-LABEL: {{^}}nsz_mad_sub0_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @unsafe_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -195,7 +196,7 @@ entry: ; GCN-LABEL: {{^}}mad_neg_src2: ; GCN-NOT: v_mac ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -221,7 +222,7 @@ entry: ; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] ; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 { +define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -247,12 +248,16 @@ bb: ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]] -; FIXME: How is this not folded? 
-; SI: v_cvt_f32_f16_e32 v{{[0-9]+}}, 0x3c00 +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[A]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]] -; VI: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] -; VI: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 { +; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]] +; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 +; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}} + +; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] +; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 +define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -276,7 +281,7 @@ bb: declare i32 @llvm.amdgcn.workitem.id.x() #2 -attributes #0 = { nounwind "unsafe-fp-math"="false" } -attributes #1 = { nounwind "unsafe-fp-math"="true" } +attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } +attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll index 151f2cc9fc73..c45af522ec49 100644 --- a/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; GCN-LABEL: {{^}}mac_f16 +; GCN-LABEL: {{^}}mac_f16: ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]] @@ -14,7 +14,7 @@ ; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] ; VI: buffer_store_short v[[C_F16]] ; GCN: s_endpgm -define void @mac_f16( +define amdgpu_kernel void @mac_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -31,13 +31,14 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_same_add +; GCN-LABEL: {{^}}mac_f16_same_add: ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} + ; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_same_add( +define amdgpu_kernel void @mac_f16_same_add( half addrspace(1)* %r0, half addrspace(1)* %r1, half addrspace(1)* %a, @@ -63,13 +64,16 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_f16_neg_a: +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_a( 
+define amdgpu_kernel void @mac_f16_neg_a( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -87,13 +91,16 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_f16_neg_b: +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_b( +define amdgpu_kernel void @mac_f16_neg_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -111,13 +118,16 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_f16_neg_c: +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_c( +define amdgpu_kernel void @mac_f16_neg_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -135,14 +145,13 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math -; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} -; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] +; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math: +; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; GCN: s_endpgm -define void @mac_f16_neg_a_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -160,14 +169,13 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math -; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} -; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] +; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math: +; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_b_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -185,14 +193,13 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math -; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} -; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] +; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math: +; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_c_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -210,13 +217,16 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} +; GCN-LABEL: 
{{^}}mac_f16_neg_a_nsz_fp_math: +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_a_unsafe_fp_math( +define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -234,13 +244,16 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} +; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math: +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_b_unsafe_fp_math( +define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -258,13 +271,16 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} +; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math: +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_c_unsafe_fp_math( +define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -282,33 +298,38 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16 +; GCN-LABEL: {{^}}mac_v2f16: ; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[C_F32_0]] -; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], 
v[[C_F32_0]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; VI: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] -; VI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]] -; VI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; VI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]] +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]] + ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @mac_v2f16( +define amdgpu_kernel void @mac_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -325,17 +346,19 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_same_add -; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]] -; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]] -; SI: v_mac_f32_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mac_f32_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]] -; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]] -; VI: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_v2f16_same_add: +; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + ; GCN: s_endpgm -define void @mac_v2f16_same_add( +define amdgpu_kernel void @mac_v2f16_same_add( <2 x half> addrspace(1)* %r0, <2 x half> addrspace(1)* %r1, <2 x half> addrspace(1)* %a, @@ -361,15 +384,18 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_v2f16_neg_a: +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} + +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}} + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_a( +define amdgpu_kernel void @mac_v2f16_neg_a( <2 x half> addrspace(1)* %r, <2 x half> 
addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -388,14 +414,17 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_b -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}} + + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_b( +define amdgpu_kernel void @mac_v2f16_neg_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -413,15 +442,22 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_v2f16_neg_c: +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} + +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]] +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_c( +define amdgpu_kernel void @mac_v2f16_neg_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -439,18 +475,20 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math -; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} -; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] -; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math: + +; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} +; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] + ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] + ; GCN: s_endpgm -define void @mac_v2f16_neg_a_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -468,18 +506,20 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math -; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} -; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] -; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] -; SI: v_mac_f32_e32 v{{[0-9]+}}, 
v[[NEG_A0]], v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math: + +; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} +; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} + ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} +; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} + ; GCN: s_endpgm -define void @mac_v2f16_neg_b_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -497,18 +537,20 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math -; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}} -; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] -; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]] -; SI: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math: + +; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} +; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} + ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} + ; GCN: s_endpgm -define void @mac_v2f16_neg_c_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -526,15 +568,22 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} +; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math: +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} + +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_a_unsafe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -552,15 +601,22 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math -; SI-NOT: 
v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} +; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math: +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} + +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_b_unsafe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -578,15 +634,22 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} +; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math: +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} + +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_c_unsafe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -604,5 +667,5 @@ entry: ret void } -attributes #0 = {"unsafe-fp-math"="false"} -attributes #1 = {"unsafe-fp-math"="true"} +attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } +attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/v_madak_f16.ll b/test/CodeGen/AMDGPU/v_madak_f16.ll index df220d7a977b..bfb10503aaea 100644 --- a/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}madak_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -7,7 +7,7 @@ ; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} ; VI: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @madak_f16( +define amdgpu_kernel void @madak_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ 
-28,7 +28,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @madak_f16_use_2( +define amdgpu_kernel void @madak_f16_use_2( half addrspace(1)* %r0, half addrspace(1)* %r1, half addrspace(1)* %a, diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll index e64f8467240a..85a8929ebe58 100644 --- a/test/CodeGen/AMDGPU/valu-i1.ll +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -29,7 +29,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]] ; SI-NEXT: ; mask branch ; -define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { +define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone switch i32 %tid, label %default [ @@ -64,29 +64,100 @@ end: ret void } -; SI-LABEL: @simple_test_v_if +; SI-LABEL: {{^}}simple_test_v_if: ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] -; SI: BB{{[0-9]+_[0-9]+}}: +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword -; SI: s_endpgm +; SI-NEXT: s_waitcnt -; SI: BB1_2: +; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_or_b64 exec, exec, [[BR_SREG]] ; SI: s_endpgm -define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 - br i1 %is.0, label %store, label %exit + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + br label %exit + +exit: + ret void +} + +; FIXME: It would be better to endpgm in the then block. + +; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret: +; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] + +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[EXIT]]: +; SI: s_or_b64 exec, exec, [[BR_SREG]] +; SI: s_endpgm +define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + ret void + +exit: + ret void +} + +; Final block has more than a ret to execute. This was miscompiled +; before function exit blocks were unified since the endpgm would +; terminate the then wavefront before reaching the store. 
+ +; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret: +; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit +; SI: ds_write_b32 +; SI: s_waitcnt + +; SI-NEXT: {{^}}[[FLOW]]: +; SI-NEXT: s_or_saveexec_b64 +; SI-NEXT: s_xor_b64 exec, exec +; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock +; SI: s_or_b64 exec, exec +; SI: s_endpgm +define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit -store: +then: %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid store i32 999, i32 addrspace(1)* %gep ret void exit: + store volatile i32 7, i32 addrspace(3)* undef ret void } @@ -101,12 +172,12 @@ exit: ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: ; SI: buffer_load_dword ; SI-DAG: buffer_store_dword -; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100 -; SI: s_cbranch_scc0 [[LABEL_LOOP]] +; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100 +; SI: s_cbranch_vccz [[LABEL_LOOP]] ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm -define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 @@ -156,7 +227,7 @@ exit: ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20 ; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]] +; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]] ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]] ; SI: [[LABEL_FLOW]]: @@ -173,7 +244,7 @@ exit: ; SI-NOT: [[COND_STATE]] ; SI: s_endpgm -define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { +define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp4 = sext i32 %tmp to i64 diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 03e473e3a0c0..5e5465800c3a 100644 --- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -1,7 +1,7 @@ # RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s --- | - define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = fcmp oeq float %cond, 0.000000e+00 br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 @@ -20,7 +20,7 @@ ret void } - define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { entry: br i1 undef, label 
%if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll index 7dcf36f144ac..03cf725601b7 100644 --- a/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/test/CodeGen/AMDGPU/vector-alloca.ll @@ -15,7 +15,7 @@ ; EG: MOV ; EG: MOV ; EG: MOVA_INT -define void @vector_read(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -44,7 +44,7 @@ entry: ; EG: MOV ; EG: MOVA_INT ; EG: MOVA_INT -define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -71,7 +71,7 @@ entry: ; FUNC-LABEL: {{^}}bitcast_gep: ; EG: STORE_RAW -define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -93,7 +93,7 @@ entry: ; OPT-LABEL: @vector_read_bitcast_gep( ; OPT: %0 = extractelement <4 x i32> , i32 %index ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 -define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -121,7 +121,7 @@ entry: ; OPT: store float ; OPT: store float ; OPT: load float -define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]* diff --git a/test/CodeGen/AMDGPU/vector-extract-insert.ll b/test/CodeGen/AMDGPU/vector-extract-insert.ll index 2d39f82e2499..ab2bfcfd1fb7 100644 --- a/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -30,7 +30,7 @@ define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> ; GCN: v_movreld_b32 ; GCN: v_movrels_b32 ; GCN: buffer_store_dword v -define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 { +define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -49,7 +49,7 @@ define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* 
%out, <4 x ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -68,7 +68,7 @@ define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> ad ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext diff --git a/test/CodeGen/AMDGPU/vectorize-global-local.ll b/test/CodeGen/AMDGPU/vectorize-global-local.ll new file mode 100644 index 000000000000..90cf34e609f6 --- /dev/null +++ b/test/CodeGen/AMDGPU/vectorize-global-local.ll @@ -0,0 +1,80 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 + +define amdgpu_kernel void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) { +bb: + %tmp = load i32, i32 addrspace(1)* %arg, align 4 + store i32 %tmp, i32 addrspace(3)* %arg1, align 4 + %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + %tmp3 = load i32, i32 addrspace(1)* %tmp2, align 4 + %tmp4 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 1 + store i32 %tmp3, i32 addrspace(3)* %tmp4, align 4 + %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 + %tmp7 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 2 + store i32 %tmp6, i32 addrspace(3)* %tmp7, align 4 + %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3 + %tmp9 = load i32, i32 addrspace(1)* %tmp8, align 4 + %tmp10 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 3 + store i32 %tmp9, i32 addrspace(3)* %tmp10, align 4 + %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4 + %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4 + %tmp13 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 4 + store i32 %tmp12, i32 addrspace(3)* %tmp13, align 4 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5 + %tmp15 = load i32, i32 addrspace(1)* %tmp14, align 4 + %tmp16 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 5 + store i32 %tmp15, i32 addrspace(3)* %tmp16, align 4 + %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6 + %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4 + %tmp19 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 6 + store i32 %tmp18, i32 addrspace(3)* %tmp19, 
align 4 + %tmp20 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7 + %tmp21 = load i32, i32 addrspace(1)* %tmp20, align 4 + %tmp22 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 7 + store i32 %tmp21, i32 addrspace(3)* %tmp22, align 4 + %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8 + %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4 + %tmp25 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 8 + store i32 %tmp24, i32 addrspace(3)* %tmp25, align 4 + %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 9 + %tmp27 = load i32, i32 addrspace(1)* %tmp26, align 4 + %tmp28 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 9 + store i32 %tmp27, i32 addrspace(3)* %tmp28, align 4 + %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 10 + %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4 + %tmp31 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 10 + store i32 %tmp30, i32 addrspace(3)* %tmp31, align 4 + %tmp32 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 11 + %tmp33 = load i32, i32 addrspace(1)* %tmp32, align 4 + %tmp34 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 11 + store i32 %tmp33, i32 addrspace(3)* %tmp34, align 4 + %tmp35 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12 + %tmp36 = load i32, i32 addrspace(1)* %tmp35, align 4 + %tmp37 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 12 + store i32 %tmp36, i32 addrspace(3)* %tmp37, align 4 + %tmp38 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 13 + %tmp39 = load i32, i32 addrspace(1)* %tmp38, align 4 + %tmp40 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 13 + store i32 %tmp39, i32 addrspace(3)* %tmp40, align 4 + %tmp41 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 14 + %tmp42 = load i32, i32 addrspace(1)* %tmp41, align 4 + %tmp43 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 14 + store i32 %tmp42, i32 addrspace(3)* %tmp43, align 4 + %tmp44 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 15 + %tmp45 = load i32, i32 addrspace(1)* %tmp44, align 4 + %tmp46 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 15 + store i32 %tmp45, i32 addrspace(3)* %tmp46, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll index 3d71062f1fba..46a1c87184d1 100644 --- a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll +++ b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll @@ -6,7 +6,7 @@ ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 -define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %v = load i32, i32 addrspace(1)* %in store i32 %v, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 ; CM: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x00,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x00,0x00 -define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel 
void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %v = load <4 x i32>, <4 x i32> addrspace(1)* %in store <4 x i32> %v, <4 x i32> addrspace(1)* %out ret void @@ -26,7 +26,7 @@ define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 -define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) { +define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) { %v = load i32, i32 addrspace(7)* %in store i32 %v, i32 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) { @t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3] -define void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) { %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in %v = load i32, i32 addrspace(2)* %a store i32 %v, i32 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index a8908f87fbf6..e82e548f23cd 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s ; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s ; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s @@ -15,16 +16,17 @@ ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_flat_scratch_init = 0 -; HSA: workitem_private_segment_byte_size = 1024 +; HSA: workitem_private_segment_byte_size = 1536 ; GCN-NOT: flat_scr ; GCNMESA-DAG: s_mov_b32 s16, s3 ; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCNMESA--DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCNMESA-DAG: s_mov_b32 s14, -1 ; SIMESA-DAG: s_mov_b32 s15, 0xe8f000 ; VIMESA-DAG: s_mov_b32 s15, 0xe80000 +; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill @@ -40,10 +42,10 @@ ; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; GCN: NumVgprs: 256 -; GCN: ScratchSize: 1024 +; GCN: ScratchSize: 1536 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset. 
-define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { bb: %tmp = add i32 %arg1, %arg2 %tmp7 = extractelement <4 x float> %arg6, i32 0 diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 4de35b97aeab..c9c8583d5e87 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; This ends up using all 255 registers and requires register ; scavenging which will fail to find an unsued register. @@ -12,19 +13,19 @@ ; GCN-LABEL: {{^}}main: -; GCN-DAG: s_mov_b32 s11, s12 -; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-DAG: s_mov_b32 s14, -1 -; SI-DAG: s_mov_b32 s15, 0xe8f000 -; VI-DAG: s_mov_b32 s15, 0xe80000 - -; s11 is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload +; GCN-DAG: s_mov_b32 s[[OFFREG:[0-9]+]], s12 +; GCN-DAG: s_mov_b32 s[[DESC0:[0-9]+]], SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1 +; SI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe8f000 +; VI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe80000 +; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000 +; OFFREG is offset system SGPR +; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 -; GCN: ScratchSize: 1024 +; GCN: ScratchSize: 1536 define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: @@ -36,7 +37,8 @@ bb: %tmp15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0 %tmp16 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp15, align 16, !tbaa !0 %tmp17 = add i32 %arg5, %arg7 - %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp16, i32 0, i32 %tmp17) + %tmp16.cast = bitcast <16 x i8> %tmp16 to <4 x i32> + %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i1 false, i1 false) %tmp19 = extractelement <4 x float> %tmp18, i32 0 %tmp20 = extractelement <4 x float> %tmp18, i32 1 %tmp21 = extractelement <4 x float> %tmp18, i32 2 @@ -180,39 +182,39 @@ bb24: ; preds = %bb157, %bb br i1 %tmp155, label %bb156, label %bb157 bb156: ; preds = %bb24 - call void @llvm.SI.export(i32 15, i32 0, i32 
0, i32 32, i32 0, float %tmp12, float %tmp103, float %tmp102, float %tmp101) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 33, i32 0, float %tmp99, float %tmp98, float %tmp97, float %tmp95) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 34, i32 0, float %tmp94, float %tmp93, float %tmp91, float %tmp90) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 35, i32 0, float %tmp89, float %tmp87, float %tmp86, float %tmp85) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 36, i32 0, float %tmp83, float %tmp82, float %tmp81, float %tmp79) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 37, i32 0, float %tmp78, float %tmp77, float %tmp75, float %tmp74) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 38, i32 0, float %tmp73, float %tmp71, float %tmp70, float %tmp69) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 39, i32 0, float %tmp67, float %tmp66, float %tmp65, float %tmp63) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 40, i32 0, float %tmp62, float %tmp61, float %tmp59, float %tmp58) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 41, i32 0, float %tmp57, float %tmp55, float %tmp54, float %tmp53) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 42, i32 0, float %tmp51, float %tmp50, float %tmp49, float %tmp47) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 43, i32 0, float %tmp46, float %tmp45, float %tmp43, float %tmp42) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 44, i32 0, float %tmp41, float %tmp39, float %tmp38, float %tmp37) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 45, i32 0, float %tmp35, float %tmp34, float %tmp33, float %tmp31) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 46, i32 0, float %tmp30, float %tmp29, float %tmp27, float %tmp26) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 47, i32 0, float %tmp25, float %tmp28, float %tmp32, float %tmp36) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 48, i32 0, float %tmp40, float %tmp44, float %tmp48, float %tmp52) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 49, i32 0, float %tmp56, float %tmp60, float %tmp64, float %tmp68) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 50, i32 0, float %tmp72, float %tmp76, float %tmp80, float %tmp84) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 51, i32 0, float %tmp88, float %tmp92, float %tmp96, float %tmp100) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 52, i32 0, float %tmp104, float %tmp105, float %tmp106, float %tmp108) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 53, i32 0, float %tmp109, float %tmp110, float %tmp111, float %tmp112) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 54, i32 0, float %tmp113, float %tmp114, float %tmp115, float %tmp116) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 55, i32 0, float %tmp117, float %tmp118, float %tmp119, float %tmp120) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 56, i32 0, float %tmp121, float %tmp122, float %tmp123, float %tmp124) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 57, i32 0, float %tmp125, float %tmp126, float %tmp127, float %tmp128) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 58, i32 0, float %tmp129, float %tmp130, float %tmp131, float %tmp132) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 59, i32 0, float %tmp133, float %tmp134, float %tmp135, float %tmp136) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 60, i32 0, float %tmp137, float %tmp138, float %tmp139, float %tmp140) - call void @llvm.SI.export(i32 15, i32 0, i32 0, 
i32 61, i32 0, float %tmp141, float %tmp142, float %tmp143, float %tmp144) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 62, i32 0, float %tmp145, float %tmp146, float %tmp147, float %tmp148) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float %tmp149, float %tmp150, float %tmp151, float %tmp13) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp12, float %tmp103, float %tmp102, float %tmp101, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %tmp99, float %tmp98, float %tmp97, float %tmp95, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 34, i32 15, float %tmp94, float %tmp93, float %tmp91, float %tmp90, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 35, i32 15, float %tmp89, float %tmp87, float %tmp86, float %tmp85, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 36, i32 15, float %tmp83, float %tmp82, float %tmp81, float %tmp79, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 37, i32 15, float %tmp78, float %tmp77, float %tmp75, float %tmp74, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 38, i32 15, float %tmp73, float %tmp71, float %tmp70, float %tmp69, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 39, i32 15, float %tmp67, float %tmp66, float %tmp65, float %tmp63, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 40, i32 15, float %tmp62, float %tmp61, float %tmp59, float %tmp58, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 41, i32 15, float %tmp57, float %tmp55, float %tmp54, float %tmp53, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 42, i32 15, float %tmp51, float %tmp50, float %tmp49, float %tmp47, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 43, i32 15, float %tmp46, float %tmp45, float %tmp43, float %tmp42, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 44, i32 15, float %tmp41, float %tmp39, float %tmp38, float %tmp37, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 45, i32 15, float %tmp35, float %tmp34, float %tmp33, float %tmp31, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 46, i32 15, float %tmp30, float %tmp29, float %tmp27, float %tmp26, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 47, i32 15, float %tmp25, float %tmp28, float %tmp32, float %tmp36, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 48, i32 15, float %tmp40, float %tmp44, float %tmp48, float %tmp52, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 49, i32 15, float %tmp56, float %tmp60, float %tmp64, float %tmp68, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 50, i32 15, float %tmp72, float %tmp76, float %tmp80, float %tmp84, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 51, i32 15, float %tmp88, float %tmp92, float %tmp96, float %tmp100, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 52, i32 15, float %tmp104, float %tmp105, float %tmp106, float %tmp108, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 53, i32 15, float %tmp109, float %tmp110, float %tmp111, float %tmp112, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 54, i32 15, float %tmp113, float %tmp114, float %tmp115, float %tmp116, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 55, i32 15, float %tmp117, float %tmp118, float %tmp119, float %tmp120, i1 false, i1 false) #0 + call void 
@llvm.amdgcn.exp.f32(i32 56, i32 15, float %tmp121, float %tmp122, float %tmp123, float %tmp124, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 57, i32 15, float %tmp125, float %tmp126, float %tmp127, float %tmp128, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 58, i32 15, float %tmp129, float %tmp130, float %tmp131, float %tmp132, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 59, i32 15, float %tmp133, float %tmp134, float %tmp135, float %tmp136, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 60, i32 15, float %tmp137, float %tmp138, float %tmp139, float %tmp140, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 61, i32 15, float %tmp141, float %tmp142, float %tmp143, float %tmp144, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 62, i32 15, float %tmp145, float %tmp146, float %tmp147, float %tmp148, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float %tmp149, float %tmp150, float %tmp151, float %tmp13, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 true, i1 false) #0 ret void bb157: ; preds = %bb24 @@ -483,18 +485,15 @@ bb157: ; preds = %bb24 br label %bb24 } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll index ad7521a3da9b..8d66c346ed5b 100644 --- a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll +++ b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll @@ -1,10 +1,10 @@ ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERROR %s -; ERROR: error: :1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget +; ERROR: error: foo.cl:1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget declare float @llvm.amdgcn.rsq.legacy(float) #0 -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -21,4 +21,4 @@ attributes #1 = { nounwind } !2 = !{i32 2, !"Dwarf Version", i32 4} !3 = !{i32 2, !"Debug Info Version", i32 3} !4 = !DILocation(line: 1, column: 42, scope: !5) -!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0) +!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, file: !1, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0) diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll index ae8ec58270c1..d2708b068eb4 100644 --- 
a/test/CodeGen/AMDGPU/vop-shrink.ll +++ b/test/CodeGen/AMDGPU/vop-shrink.ll @@ -8,7 +8,7 @@ ; ModuleID = 'vop-shrink.ll' -define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { +define amdgpu_kernel void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { entry: %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1 %tmp = icmp eq i32 %cond, 0 @@ -35,7 +35,7 @@ endif: ; preds = %else, %if ; FUNC-LABEL: {{^}}add_fold: ; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 -define void @add_fold(float addrspace(1)* %out) { +define amdgpu_kernel void @add_fold(float addrspace(1)* %out) { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = uitofp i32 %tmp to float diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll index fe5be7526b19..bb6234729f90 100644 --- a/test/CodeGen/AMDGPU/vselect.ll +++ b/test/CodeGen/AMDGPU/vselect.ll @@ -10,7 +10,7 @@ ; SI: v_cndmask_b32_e64 ; SI: v_cndmask_b32_e32 -define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { +define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { entry: %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 @@ -28,7 +28,7 @@ entry: ;SI: v_cndmask_b32_e64 ;SI: v_cndmask_b32_e32 -define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { +define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { entry: %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 @@ -52,7 +52,7 @@ entry: ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 -define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { +define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { entry: %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 @@ -68,7 +68,7 @@ entry: ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { +define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { entry: %0 = load <4 x float>, <4 x float> addrspace(1)* %in0 %1 = load <4 x float>, <4 x float> addrspace(1)* %in1 diff --git a/test/CodeGen/AMDGPU/vselect64.ll b/test/CodeGen/AMDGPU/vselect64.ll index ef85ebe7899f..4a0435565161 100644 --- a/test/CodeGen/AMDGPU/vselect64.ll +++ b/test/CodeGen/AMDGPU/vselect64.ll @@ -5,7 +5,7 @@ ; Make sure the vectors aren't being stored on the stack. We know they are ; being stored on the stack if the shaders uses at leat 10 registers. 
; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X -define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) { +define amdgpu_kernel void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) { entry: %cmp = icmp ne <4 x i32> %c, %result = select <4 x i1> %cmp, <4 x i64> , <4 x i64> diff --git a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll index 4584d6e25254..4c5eb3d3aa5d 100644 --- a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll +++ b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll @@ -10,7 +10,7 @@ ; CHECK-NOT: ALU_POP_AFTER ; CHECK: TEX ; CHECK-NEXT: POP -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %endif, label %if diff --git a/test/CodeGen/AMDGPU/vtx-schedule.ll b/test/CodeGen/AMDGPU/vtx-schedule.ll index 912e258ebb83..c4b619bf168f 100644 --- a/test/CodeGen/AMDGPU/vtx-schedule.ll +++ b/test/CodeGen/AMDGPU/vtx-schedule.ll @@ -9,7 +9,7 @@ ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 ; CHECK: Fetch clause ; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 -define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { +define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { entry: %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 %1 = load i32, i32 addrspace(1)* %0 diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll index 621c582fcefd..623cbeae8da9 100644 --- a/test/CodeGen/AMDGPU/wait.ll +++ b/test/CodeGen/AMDGPU/wait.ll @@ -11,26 +11,27 @@ ; DEFAULT: exp ; DEFAULT: s_waitcnt lgkmcnt(0) ; DEFAULT: s_endpgm -define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) { +define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6) + %tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32> + %tmp11 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i1 false, i1 false) %tmp12 = extractelement <4 x float> %tmp11, i32 0 %tmp13 = extractelement <4 x float> %tmp11, i32 1 call void @llvm.amdgcn.s.barrier() #1 %tmp14 = extractelement <4 x float> %tmp11, i32 2 -; %tmp15 = extractelement <4 x float> %tmp11, i32 3 - %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt + %tmp15 = load float, float addrspace(2)* %constptr, align 4 %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 - %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) + %tmp17.cast = bitcast <16 x i8> %tmp17 to <4 x i32> + %tmp18 = call <4 x float> 
@llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i1 false, i1 false) %tmp19 = extractelement <4 x float> %tmp18, i32 0 %tmp20 = extractelement <4 x float> %tmp18, i32 1 %tmp21 = extractelement <4 x float> %tmp18, i32 2 %tmp22 = extractelement <4 x float> %tmp18, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15) + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp12, float %tmp13, float %tmp14, float %tmp15, i1 true, i1 false) #0 ret void } @@ -41,45 +42,42 @@ main_body: ; ILPMAX: s_load_dwordx4 ; ILPMAX: s_waitcnt lgkmcnt(0) ; ILPMAX: buffer_load -; ILPMAX: s_waitcnt vmcnt(1) ; ILPMAX: s_waitcnt vmcnt(0) +; ILPMAX: exp pos0 +; ILPMAX-NEXT: exp param0 ; ILPMAX: s_endpgm - -define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* -byval, i32 inreg, i32 inreg, i32, i32, i32, i32) { +define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { main_body: - %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0 - %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0 - %13 = add i32 %5, %7 - %14 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %12, i32 0, i32 %13) - %15 = extractelement <4 x float> %14, i32 0 - %16 = extractelement <4 x float> %14, i32 1 - %17 = extractelement <4 x float> %14, i32 2 - %18 = extractelement <4 x float> %14, i32 3 - %19 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1 - %20 = load <16 x i8>, <16 x i8> addrspace(2)* %19, align 16, !tbaa !0 - %21 = add i32 %5, %7 - %22 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %20, i32 0, i32 %21) - %23 = extractelement <4 x float> %22, i32 0 - %24 = extractelement <4 x float> %22, i32 1 - %25 = extractelement <4 x float> %22, i32 2 - %26 = extractelement <4 x float> %22, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %15, float %16, float %17, float %18) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %23, float %24, float %25, float %26) + %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0 + %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0 + %tmp12 = add i32 %arg5, %arg7 + %tmp11.cast = bitcast <16 x i8> %tmp11 to <4 x i32> + %tmp13 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i1 false, i1 false) + %tmp14 = extractelement <4 x float> %tmp13, i32 0 + %tmp15 = extractelement <4 x float> %tmp13, i32 1 + %tmp16 = extractelement <4 x float> %tmp13, i32 2 + %tmp17 = extractelement <4 x float> %tmp13, i32 3 + %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 1 + %tmp19 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp18, align 16, !tbaa !0 + %tmp20 = add i32 %arg5, %arg7 
+ %tmp19.cast = bitcast <16 x i8> %tmp19 to <4 x i32> + %tmp21 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i1 false, i1 false) + %tmp22 = extractelement <4 x float> %tmp21, i32 0 + %tmp23 = extractelement <4 x float> %tmp21, i32 1 + %tmp24 = extractelement <4 x float> %tmp21, i32 2 + %tmp25 = extractelement <4 x float> %tmp21, i32 3 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp14, float %tmp15, float %tmp16, float %tmp17, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp22, float %tmp23, float %tmp24, float %tmp25, i1 true, i1 false) #0 ret void } - -; Function Attrs: convergent nounwind declare void @llvm.amdgcn.s.barrier() #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - +attributes #0 = { nounwind } attributes #1 = { convergent nounwind } -attributes #2 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/test/CodeGen/AMDGPU/waitcnt-flat.ll b/test/CodeGen/AMDGPU/waitcnt-flat.ll index d29bae45d8c2..5d86b12da95f 100644 --- a/test/CodeGen/AMDGPU/waitcnt-flat.ll +++ b/test/CodeGen/AMDGPU/waitcnt-flat.ll @@ -9,7 +9,7 @@ ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]] ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0) ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}] -define void @test(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { store volatile i32 0, i32 addrspace(1)* %out %val = load volatile i32, i32 addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir index cb5de6a2419d..38662e83b359 100644 --- a/test/CodeGen/AMDGPU/waitcnt.mir +++ b/test/CodeGen/AMDGPU/waitcnt.mir @@ -1,12 +1,21 @@ # RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waits %s -o - | FileCheck %s --- | - define void @flat_zero_waitcnt(i32 addrspace(1)* %global4, + define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4, <4 x i32> addrspace(1)* %global16, i32 addrspace(4)* %flat4, <4 x i32> addrspace(4)* %flat16) { ret void } + + define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() { + ret void + } + + define amdgpu_kernel void @single_branch_successor_not_next_block() { + ret void + } + ... --- @@ -21,18 +30,21 @@ # CHECK-LABEL: bb.1: # CHECK: FLAT_LOAD_DWORD +# CHECK: S_WAITCNT 3952 # CHECK: FLAT_LOAD_DWORDX4 # The first load has no mem operand, so we should assume it accesses the flat # address space. # s_waitcnt vmcnt(0) lgkmcnt(0) -# CHECK-NEXT: S_WAITCNT 112 +# CHECK-NEXT: S_WAITCNT 127 # CHECK-LABEL: bb.2: # CHECK: FLAT_LOAD_DWORD +# CHECK: S_WAITCNT 3952 # CHECK: FLAT_LOAD_DWORDX4 + # One outstand loads access the flat address space. # s_waitcnt vmcnt(0) lgkmcnt(0) -# CHECK-NEXT: S_WAITCNT 112 +# CHECK-NEXT: S_WAITCNT 127 name: flat_zero_waitcnt @@ -57,3 +69,60 @@ body: | %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_ENDPGM ... +--- +# There is only a single fallthrough successor block, so there's no +# need to wait immediately. 
+ +# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait +# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2 +# CHECK-NOT: S_WAITCNT + +# CHECK: bb.1: +# CHECK-NEXT: V_LSHLREV_B64 +# CHECK-NEXT: S_WAITCNT 112 +# CHECK-NEXT: FLAT_STORE_DWORD +name: single_fallthrough_successor_no_end_block_wait + +body: | + bb.0: + successors: %bb.1 + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + + bb.1: + %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... +--- +# The block has a single predecessor with a single successor, but it +# is not the next block so it's non-obvious that the wait is not needed. + + +# CHECK-LABEL: name: single_branch_successor_not_next_block +# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2 +# CHECK-NEXT: S_WAITCNT 112 + +# CHECK: bb.1 +# CHECK-NEXT: FLAT_STORE_DWORD +# CHECK-NEXT: S_ENDPGM + +# CHECK: bb.2: +# CHECK-NEXT: V_LSHLREV_B64 +# CHECK-NEXT: FLAT_STORE_DWORD +name: single_branch_successor_not_next_block + +body: | + bb.0: + successors: %bb.2 + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + S_BRANCH %bb.2 + + bb.1: + FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM + + bb.2: + %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM +... diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll index 3f7b2b284c53..9f277b2c9a59 100644 --- a/test/CodeGen/AMDGPU/wqm.ll +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -1,5 +1,5 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s ; Check that WQM isn't triggered by image load/store intrinsics. 
; @@ -18,16 +18,14 @@ main_body: ;CHECK-NEXT: ; %main_body ;CHECK-NEXT: s_wqm_b64 exec, exec ;CHECK-NOT: exec -define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) { +define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) { main_body: - %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %c.2 = bitcast <4 x float> %c.1 to <4 x i32> %c.3 = extractelement <4 x i32> %c.2, i32 0 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3 %data = load float, float addrspace(1)* %gep - - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef) - + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1 ret void } @@ -42,9 +40,9 @@ main_body: ;CHECK: store ;CHECK-NOT: exec ;CHECK: .size test3 -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x float> %c) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %tex.1 = bitcast <4 x float> %tex to <4 x i32> %tex.2 = extractelement <4 x i32> %tex.1, i32 0 @@ -70,10 +68,9 @@ main_body: %c.1 = mul i32 %c, %d call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0) - - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.1.bc = bitcast i32 %c.1 to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -101,9 +98,9 @@ main_body: br i1 %cmp, label %IF, label %ELSE IF: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> 
@llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %data.if = extractelement <4 x float> %dtex, i32 0 br label %END @@ -143,9 +140,9 @@ main_body: br i1 %cmp, label %ELSE, label %IF IF: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %data.if = extractelement <4 x float> %dtex, i32 0 br label %END @@ -200,7 +197,8 @@ ELSE: END: %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %coord.END.bc = bitcast i32 %coord.END to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %tex } @@ -215,13 +213,11 @@ END: ;CHECK: image_sample ;CHECK: v_cmp ;CHECK: store -define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) { +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %dtex.1 = extractelement <4 x float> %dtex, i32 0 - call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) %cc = fcmp ogt float %dtex.1, 0.0 @@ -254,7 +250,7 @@ END: ;CHECK: %END ;CHECK: image_sample ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) { +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) { main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END @@ -265,9 +261,8 @@ IF: br label %END END: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex 
to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -286,10 +281,9 @@ END: ;CHECK: buffer_store_dword ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %idx.0 = extractelement <2 x i32> %idx, i32 0 %data.0 = extractelement <2 x float> %data, i32 0 call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) @@ -299,10 +293,8 @@ main_body: %idx.1 = extractelement <2 x i32> %idx, i32 1 %data.1 = extractelement <2 x float> %data, i32 1 call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) - - %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %out = fadd <4 x float> %tex, %dtex ret <4 x float> %out @@ -320,11 +312,10 @@ main_body: ; CHECK: buffer_store_dword ; CHECK-NOT: wqm ; CHECK: v_cmpx_ -define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x 
i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) @@ -375,8 +366,7 @@ loop: br i1 %cc, label %break, label %body body: - %c.i = bitcast <4 x float> %c.iv to <4 x i32> - %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.next = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c.iv, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %ctr.next = fadd float %ctr.iv, 2.0 br label %loop @@ -394,7 +384,7 @@ break: ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 ; CHECK: s_wqm_b64 exec, exec -; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+$}} +; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}} ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen ; CHECK: s_wqm_b64 exec, exec @@ -416,9 +406,8 @@ entry: %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx %c = load i32, i32* %c.gep, align 4 - - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %c.bc = bitcast i32 %c to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) ret void @@ -436,9 +425,8 @@ entry: ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { - %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.i = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -450,10 +438,8 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { entry: - %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.i = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x 
float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %cc = icmp sgt i32 %c, 0 br i1 %cc, label %if, label %else @@ -485,35 +471,29 @@ main_body: br i1 %cc, label %if, label %else if: - %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r.if = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float 0.0, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 br label %end else: - %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r.else = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 br label %end end: %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] - call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) - ret <4 x float> %r } - +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 - -declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 - -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 - -declare void @llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3 +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 +declare void @llvm.AMDGPU.kill(float) #1 attributes #1 = { nounwind } attributes #2 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll index deac809f9b05..b1ee016e99c9 100644 --- a/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll +++ b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll @@ -10,7 +10,7 @@ declare void @llvm.write_register.i32(metadata, 
i32) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 -define void @write_vgpr_into_sgpr() { +define amdgpu_kernel void @write_vgpr_into_sgpr() { %tid = call i32 @llvm.amdgcn.workitem.id.x() call void @llvm.write_register.i32(metadata !0, i32 %tid) ret void diff --git a/test/CodeGen/AMDGPU/write_register.ll b/test/CodeGen/AMDGPU/write_register.ll index 88660ba6ec6a..9c62e003dde0 100644 --- a/test/CodeGen/AMDGPU/write_register.ll +++ b/test/CodeGen/AMDGPU/write_register.ll @@ -4,7 +4,7 @@ declare void @llvm.write_register.i32(metadata, i32) #0 declare void @llvm.write_register.i64(metadata, i64) #0 ; CHECK-LABEL: {{^}}test_write_m0: -define void @test_write_m0(i32 %val) #0 { +define amdgpu_kernel void @test_write_m0(i32 %val) #0 { call void @llvm.write_register.i32(metadata !0, i32 0) call void @llvm.write_register.i32(metadata !0, i32 -1) call void @llvm.write_register.i32(metadata !0, i32 %val) @@ -15,7 +15,7 @@ define void @test_write_m0(i32 %val) #0 { ; CHECK: s_mov_b64 exec, 0 ; CHECK: s_mov_b64 exec, -1 ; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}} -define void @test_write_exec(i64 %val) #0 { +define amdgpu_kernel void @test_write_exec(i64 %val) #0 { call void @llvm.write_register.i64(metadata !1, i64 0) call void @llvm.write_register.i64(metadata !1, i64 -1) call void @llvm.write_register.i64(metadata !1, i64 %val) @@ -26,7 +26,7 @@ define void @test_write_exec(i64 %val) #0 { ; CHECK: s_mov_b64 flat_scratch, 0 ; CHECK: s_mov_b64 flat_scratch, -1 ; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}} -define void @test_write_flat_scratch(i64 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch(i64 %val) #0 { call void @llvm.write_register.i64(metadata !2, i64 0) call void @llvm.write_register.i64(metadata !2, i64 -1) call void @llvm.write_register.i64(metadata !2, i64 %val) @@ -36,7 +36,7 @@ define void @test_write_flat_scratch(i64 %val) #0 { ; CHECK-LABEL: {{^}}test_write_flat_scratch_lo: ; CHECK: s_mov_b32 flat_scratch_lo, 0 ; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}} -define void @test_write_flat_scratch_lo(i32 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch_lo(i32 %val) #0 { call void @llvm.write_register.i32(metadata !3, i32 0) call void @llvm.write_register.i32(metadata !3, i32 %val) ret void @@ -45,7 +45,7 @@ define void @test_write_flat_scratch_lo(i32 %val) #0 { ; CHECK-LABEL: {{^}}test_write_flat_scratch_hi: ; CHECK: s_mov_b32 flat_scratch_hi, 0 ; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}} -define void @test_write_flat_scratch_hi(i32 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch_hi(i32 %val) #0 { call void @llvm.write_register.i32(metadata !4, i32 0) call void @llvm.write_register.i32(metadata !4, i32 %val) ret void @@ -54,7 +54,7 @@ define void @test_write_flat_scratch_hi(i32 %val) #0 { ; CHECK-LABEL: {{^}}test_write_exec_lo: ; CHECK: s_mov_b32 exec_lo, 0 ; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}} -define void @test_write_exec_lo(i32 %val) #0 { +define amdgpu_kernel void @test_write_exec_lo(i32 %val) #0 { call void @llvm.write_register.i32(metadata !5, i32 0) call void @llvm.write_register.i32(metadata !5, i32 %val) ret void @@ -63,7 +63,7 @@ define void @test_write_exec_lo(i32 %val) #0 { ; CHECK-LABEL: {{^}}test_write_exec_hi: ; CHECK: s_mov_b32 exec_hi, 0 ; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}} -define void @test_write_exec_hi(i32 %val) #0 { +define amdgpu_kernel void @test_write_exec_hi(i32 %val) #0 { call void @llvm.write_register.i32(metadata !6, i32 0) call void @llvm.write_register.i32(metadata !6, i32 %val) ret void diff 
--git a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll index 7f6b80459047..36532365d871 100644 --- a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll +++ b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll @@ -4,7 +4,7 @@ ;CHECK: {{^}}fill3d: ;CHECK-NOT: MULLO_INT T[0-9]+ -define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @fill3d(i32 addrspace(1)* nocapture %out) #0 { entry: %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 diff --git a/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll b/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll index babae9ead27c..88ef9fd93c8f 100644 --- a/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll +++ b/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll @@ -5,7 +5,7 @@ ; TODO: enable doubles ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { %val = load double, double addrspace(1)* %in, align 8 %add = fadd double %val, 4.0 %bc = bitcast double %add to <2 x i32> @@ -14,7 +14,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace } ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64: -define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { +define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -30,7 +30,7 @@ end: } ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64: -define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { +define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll index bf02d4c3b311..57a082a0170c 100644 --- a/test/CodeGen/AMDGPU/xor.ll +++ b/test/CodeGen/AMDGPU/xor.ll @@ -10,7 +10,7 @@ ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { +define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 %result = xor <2 x i32> %a, %b @@ -29,7 +29,7 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { +define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 %result = xor <4 x i32> %a, %b @@ -46,7 +46,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* 
%in1) { +define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float, float addrspace(1) * %in0 %b = load float, float addrspace(1) * %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -63,7 +63,7 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] ; SI: buffer_store_byte [[RESULT]] -define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { +define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { %a = load volatile i1, i1 addrspace(1)* %in0 %b = load volatile i1, i1 addrspace(1)* %in1 %xor = xor i1 %a, %b @@ -73,7 +73,7 @@ define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace ; FUNC-LABEL: {{^}}vector_xor_i32: ; SI: v_xor_b32_e32 -define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %a = load i32, i32 addrspace(1)* %in0 %b = load i32, i32 addrspace(1)* %in1 %result = xor i32 %a, %b @@ -83,7 +83,7 @@ define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 ; FUNC-LABEL: {{^}}scalar_xor_i32: ; SI: s_xor_b32 -define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = xor i32 %a, %b store i32 %result, i32 addrspace(1)* %out ret void @@ -91,7 +91,7 @@ define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; FUNC-LABEL: {{^}}scalar_not_i32: ; SI: s_not_b32 -define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { %result = xor i32 %a, -1 store i32 %result, i32 addrspace(1)* %out ret void @@ -99,7 +99,7 @@ define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { ; FUNC-LABEL: {{^}}vector_not_i32: ; SI: v_not_b32 -define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %a = load i32, i32 addrspace(1)* %in0 %b = load i32, i32 addrspace(1)* %in1 %result = xor i32 %a, -1 @@ -111,7 +111,7 @@ define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 ; SI: v_xor_b32_e32 ; SI: v_xor_b32_e32 ; SI: s_endpgm -define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { %a = load i64, i64 addrspace(1)* %in0 %b = load i64, i64 addrspace(1)* %in1 %result = xor i64 %a, %b @@ -122,7 +122,7 @@ define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 ; FUNC-LABEL: {{^}}scalar_xor_i64: ; SI: s_xor_b64 ; SI: s_endpgm -define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = xor i64 %a, %b store i64 %result, i64 addrspace(1)* %out ret void @@ -130,7 +130,7 @@ define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; FUNC-LABEL: {{^}}scalar_not_i64: ; SI: s_not_b64 -define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { +define 
amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { %result = xor i64 %a, -1 store i64 %result, i64 addrspace(1)* %out ret void @@ -139,7 +139,7 @@ define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { ; FUNC-LABEL: {{^}}vector_not_i64: ; SI: v_not_b32 ; SI: v_not_b32 -define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { %a = load i64, i64 addrspace(1)* %in0 %b = load i64, i64 addrspace(1)* %in1 %result = xor i64 %a, -1 @@ -153,7 +153,7 @@ define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 ; FUNC-LABEL: {{^}}xor_cf: ; SI: s_xor_b64 -define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { +define amdgpu_kernel void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -178,7 +178,7 @@ endif: ; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void @@ -192,7 +192,7 @@ define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -211,7 +211,7 @@ define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i6 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: xor_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void @@ -220,7 +220,7 @@ define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64: ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI: s_xor_b64 [[VAL]], [[VAL]], -8 -define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void @@ -231,7 +231,7 @@ define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { ; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]] ; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}} ; SI: s_endpgm -define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = xor i64 %loada, -8 store i64 %or, i64 addrspace(1)* %out @@ -243,7 +243,7 @@ define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, 
v[[LO_VREG]] ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] ; SI: s_endpgm -define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = xor i64 %loada, 22470723082367 store i64 %or, i64 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll index 572099617605..f256d89f0cb7 100644 --- a/test/CodeGen/AMDGPU/zero_extend.ll +++ b/test/CodeGen/AMDGPU/zero_extend.ll @@ -9,7 +9,7 @@ ; SI: {{^}}s_mad_zext_i32_to_i64: ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} -define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 { entry: %tmp0 = mul i32 %a, %b %tmp1 = add i32 %tmp0, %c @@ -20,7 +20,7 @@ entry: ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32 ; SI: v_cndmask_b32 -define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %tmp0 = icmp eq i32 %a, %b %tmp1 = zext i1 %tmp0 to i32 @@ -29,7 +29,7 @@ entry: } ; SI-LABEL: {{^}}s_arg_zext_i1_to_i64: -define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { +define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { %ext = zext i1 %arg to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void @@ -39,7 +39,7 @@ define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { ; SI: s_mov_b32 s{{[0-9]+}}, 0 ; SI: v_cmp_eq_u32 ; SI: v_cndmask_b32 -define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 @@ -49,7 +49,7 @@ define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; SI: buffer_store_short [[RESULT]] -define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { %tmp0 = icmp eq i16 %a, %b %tmp1 = zext i1 %tmp0 to i16 store i16 %tmp1, i16 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll index 842c30b40df2..a902234898cd 100644 --- a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll +++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll @@ -11,7 +11,7 @@ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %ld.64 = load volatile i64, i64 addrspace(1)* %in0 %ld.32 = load volatile i32, i32 addrspace(1)* %in1 %ext = zext i32 %ld.32 to i64 @@ -31,7 +31,7 @@ define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, ; GCN-NOT: _or_ ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN: 
buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %ld.64 = load volatile i64, i64 addrspace(1)* %in0 %ld.32 = load volatile i32, i32 addrspace(1)* %in1 %ext = zext i32 %ld.32 to i64 diff --git a/test/CodeGen/AMDGPU/zext-lid.ll b/test/CodeGen/AMDGPU/zext-lid.ll new file mode 100644 index 000000000000..8eeff53ff99f --- /dev/null +++ b/test/CodeGen/AMDGPU/zext-lid.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s + +; CHECK-NOT: and_b32 + +; OPT-LABEL: @zext_grp_size_128 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0 +define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 127 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 127 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 127 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +; OPT-LABEL: @zext_grp_size_32x4x1 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4 +define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 31 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 3 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 1 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +; OPT-LABEL: @zext_grp_size_512 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5 +define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 65535 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 65535 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 65535 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #2 + +declare i32 @llvm.amdgcn.workitem.id.y() #2 
+ +declare i32 @llvm.amdgcn.workitem.id.z() #2 + +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" } +attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" } +attributes #2 = { nounwind readnone } + +!0 = !{i32 32, i32 4, i32 1} + +; OPT: !0 = !{i32 0, i32 128} +; OPT: !1 = !{i32 32, i32 4, i32 1} +; OPT: !2 = !{i32 0, i32 32} +; OPT: !3 = !{i32 0, i32 4} +; OPT: !4 = !{i32 0, i32 1} +; OPT: !5 = !{i32 0, i32 512} diff --git a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll index 52cc37e24084..b8f2980be750 100644 --- a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll +++ b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll @@ -12,11 +12,11 @@ ; CHECK: bl _quux ; CHECK-NOT: bl _quux -; NOMERGE: bl _baz -; NOMERGE: bl _baz +; NOMERGE-DAG: bl _baz +; NOMERGE-DAG: bl _baz -; NOMERGE: bl _quux -; NOMERGE: bl _quux +; NOMERGE-DAG: bl _quux +; NOMERGE-DAG: bl _quux ; ModuleID = 'tail.c' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64" diff --git a/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll b/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll index 5d59fc64d922..e5c2fb4d67a1 100644 --- a/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll +++ b/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s ; PR4091 define void @foo(i32 %i, i32* %p) nounwind { diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll index 4a1341c4d6e7..2a5af6199a34 100644 --- a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll +++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll @@ -12,13 +12,14 @@ define void @test_byval_8_bytes_alignment(i32 %i, ...) { entry: ; CHECK: sub sp, sp, #12 ; CHECK: sub sp, sp, #4 -; CHECK: stmib sp, {r1, r2, r3} +; CHECK: add r0, sp, #4 +; CHECK: stm sp, {r0, r1, r2, r3} %g = alloca i8* %g1 = bitcast i8** %g to i8* call void @llvm.va_start(i8* %g1) ; CHECK: add [[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7 -; CHECK: bfc [[REG]], #0, #3 +; CHECK: bic [[REG]], [[REG]], #7 %0 = va_arg i8** %g, double call void @llvm.va_end(i8* %g1) diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 5c0853cfaab4..66d9033a6d7c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -1,10 +1,135 @@ # RUN: llc -O0 -mtriple arm-- -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- | + define void @test_zext_s1() { ret void } + define void @test_sext_s1() { ret void } + define void @test_sext_s8() { ret void } + define void @test_zext_s16() { ret void } + define void @test_add_s8() { ret void } define void @test_add_s16() { ret void } define void @test_add_s32() { ret void } + define void @test_fadd_s32() #0 { ret void } + define void @test_fadd_s64() #0 { ret void } + define void @test_load_from_stack() { ret void } + define void @test_load_f32() #0 { ret void } + define void @test_load_f64() #0 { ret void } + + define void @test_stores() #0 { ret void } + + define void @test_gep() { ret void } + define void @test_constants() { ret void } + + define void @test_soft_fp_double() #0 { ret void } + + attributes #0 = { "target-features"="+vfp2,-neonfp" } +... 
+--- +name: test_zext_s1 +# CHECK-LABEL: name: test_zext_s1 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s1) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_ZEXT %0(s1) + ; CHECK: [[VREGEXT:%[0-9]+]] = ANDri [[VREGX]], 1, 14, _, _ + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_sext_s1 +# CHECK-LABEL: name: test_sext_s1 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s1) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_SEXT %0(s1) + ; CHECK: [[VREGAND:%[0-9]+]] = ANDri [[VREGX]], 1, 14, _, _ + ; CHECK: [[VREGEXT:%[0-9]+]] = RSBri [[VREGAND]], 0, 14, _, _ + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_sext_s8 +# CHECK-LABEL: name: test_sext_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s8) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_SEXT %0(s8) + ; CHECK: [[VREGEXT:%[0-9]+]] = SXTB [[VREGX]], 0, 14, _ + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_zext_s16 +# CHECK-LABEL: name: test_zext_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s16) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_ZEXT %0(s16) + ; CHECK: [[VREGEXT:%[0-9]+]] = UXTH [[VREGX]], 0, 14, _ + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 ... --- name: test_add_s8 @@ -106,6 +231,72 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_fadd_s32 +# CHECK-LABEL: name: test_fadd_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: fprb } +# CHECK: id: 0, class: spr +# CHECK: id: 1, class: spr +# CHECK: id: 2, class: spr +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s32) = G_FADD %0, %1 + ; CHECK: [[VREGSUM:%[0-9]+]] = VADDS [[VREGX]], [[VREGY]], 14, _ + + %s0 = COPY %2(s32) + ; CHECK: %s0 = COPY [[VREGSUM]] + + BX_RET 14, _, implicit %s0 + ; CHECK: BX_RET 14, _, implicit %s0 +... 
+--- +name: test_fadd_s64 +# CHECK-LABEL: name: test_fadd_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: fprb } +# CHECK: id: 0, class: dpr +# CHECK: id: 1, class: dpr +# CHECK: id: 2, class: dpr +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s64) = G_FADD %0, %1 + ; CHECK: [[VREGSUM:%[0-9]+]] = VADDD [[VREGX]], [[VREGY]], 14, _ + + %d0 = COPY %2(s64) + ; CHECK: %d0 = COPY [[VREGSUM]] + + BX_RET 14, _, implicit %d0 + ; CHECK: BX_RET 14, _, implicit %d0 +... +--- name: test_load_from_stack # CHECK-LABEL: name: test_load_from_stack legalized: true @@ -122,20 +313,225 @@ registers: # CHECK-DAG: id: 2, class: gpr # CHECK-DAG: id: 3, class: gpr fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 0, offset: 0, size: 1, alignment: 4, isImmutable: true, isAliased: false } - { id: 1, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - { id: 2, offset: 8, size: 4, alignment: 4, isImmutable: true, isAliased: false } -# CHECK: id: [[FRAME_INDEX:[0-9]+]], offset: 8 +# CHECK-DAG: id: [[FI1:[0-9]+]], offset: 0 +# CHECK-DAG: id: [[FI32:[0-9]+]], offset: 8 body: | bb.0: liveins: %r0, %r1, %r2, %r3 %0(p0) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[FIVREG:%[0-9]+]] = ADDri %fixed-stack.[[FRAME_INDEX]], 0, 14, _, _ + ; CHECK: [[FI32VREG:%[0-9]+]] = ADDri %fixed-stack.[[FI32]], 0, 14, _, _ + + %1(s32) = G_LOAD %0(p0) :: (load 4) + ; CHECK: [[LD32VREG:%[0-9]+]] = LDRi12 [[FI32VREG]], 0, 14, _ + + %r0 = COPY %1 + ; CHECK: %r0 = COPY [[LD32VREG]] + + %2(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[FI1VREG:%[0-9]+]] = ADDri %fixed-stack.[[FI1]], 0, 14, _, _ - %1(s32) = G_LOAD %0(p0) - ; CHECK: {{%[0-9]+}} = LDRi12 [[FIVREG]], 0, 14, _ + %3(s1) = G_LOAD %2(p0) :: (load 1) + ; CHECK: [[LD1VREG:%[0-9]+]] = LDRBi12 [[FI1VREG]], 0, 14, _ + + %r0 = COPY %3 + ; CHECK: %r0 = COPY [[LD1VREG]] BX_RET 14, _ ; CHECK: BX_RET 14, _ ... +--- +name: test_load_f32 +# CHECK-LABEL: name: test_load_f32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: fprb } +# CHECK-DAG: id: [[P:[0-9]+]], class: gpr +# CHECK-DAG: id: [[V:[0-9]+]], class: spr +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(p0) = COPY %r0 + + %1(s32) = G_LOAD %0(p0) :: (load 4) + ; CHECK: %[[V]] = VLDRS %[[P]], 0, 14, _ + + %s0 = COPY %1 + ; CHECK: %s0 = COPY %[[V]] + + BX_RET 14, _, implicit %s0 + ; CHECK: BX_RET 14, _, implicit %s0 +... +--- +name: test_load_f64 +# CHECK-LABEL: name: test_load_f64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: fprb } +# CHECK-DAG: id: [[P:[0-9]+]], class: gpr +# CHECK-DAG: id: [[V:[0-9]+]], class: dpr +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(p0) = COPY %r0 + + %1(s64) = G_LOAD %0(p0) :: (load 8) + ; CHECK: %[[V]] = VLDRD %[[P]], 0, 14, _ + + %d0 = COPY %1 + ; CHECK: %d0 = COPY %[[V]] + + BX_RET 14, _, implicit %d0 + ; CHECK: BX_RET 14, _, implicit %d0 +... 
+--- +name: test_stores +# CHECK-LABEL: name: test_stores +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: fprb } + - { id: 5, class: fprb } +# CHECK: id: [[P:[0-9]+]], class: gpr +# CHECK: id: [[I8:[0-9]+]], class: gpr +# CHECK: id: [[I16:[0-9]+]], class: gpr +# CHECK: id: [[I32:[0-9]+]], class: gpr +# CHECK: id: [[F32:[0-9]+]], class: spr +# CHECK: id: [[F64:[0-9]+]], class: dpr +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(p0) = COPY %r0 + %1(s8) = COPY %r3 + %2(s16) = COPY %r2 + %3(s32) = COPY %r1 + %4(s32) = COPY %s0 + %5(s64) = COPY %d2 + + G_STORE %1(s8), %0(p0) :: (store 1) + ; CHECK: STRBi12 %[[I8]], %[[P]], 0, 14, _ + + G_STORE %2(s16), %0(p0) :: (store 2) + ; CHECK: STRH %[[I16]], %[[P]], _, 0, 14, _ + + G_STORE %3(s32), %0(p0) :: (store 4) + ; CHECK: STRi12 %[[I32]], %[[P]], 0, 14, _ + + G_STORE %4(s32), %0(p0) :: (store 4) + ; CHECK: VSTRS %[[F32]], %[[P]], 0, 14, _ + + G_STORE %5(s64), %0(p0) :: (store 8) + ; CHECK: VSTRD %[[F64]], %[[P]], 0, 14, _ + + BX_RET 14, _ +... +--- +name: test_gep +# CHECK-LABEL: name: test_gep +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK: id: [[PTR:[0-9]+]], class: gpr +# CHECK: id: [[OFF:[0-9]+]], class: gpr +# CHECK: id: [[GEP:[0-9]+]], class: gpr +body: | + bb.0: + liveins: %r0, %r1 + + %0(p0) = COPY %r0 + %1(s32) = COPY %r1 + + %2(p0) = G_GEP %0, %1(s32) + ; CHECK: %[[GEP]] = ADDrr %[[PTR]], %[[OFF]], 14, _, _ + + %r0 = COPY %2(p0) + BX_RET 14, _, implicit %r0 +... +--- +name: test_constants +# CHECK-LABEL: name: test_constants +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +# CHECK: id: [[C:[0-9]+]], class: gpr +body: | + bb.0: + %0(s32) = G_CONSTANT 42 + ; CHECK: %[[C]] = MOVi 42, 14, _, _ + + %r0 = COPY %0(s32) + BX_RET 14, _, implicit %r0 +... +--- +name: test_soft_fp_double +# CHECK-LABEL: name: test_soft_fp_double +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: fprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +# CHECK-DAG: id: {{[0-9]+}}, class: gpr +# CHECK-DAG: id: {{[0-9]+}}, class: gpr +# CHECK-DAG: id: {{[0-9]+}}, class: gpr +# CHECK-DAG: id: {{[0-9]+}}, class: gpr +# CHECK-DAG: id: [[DREG:[0-9]+]], class: dpr +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r2 + ; CHECK: [[IN1:%[0-9]+]] = COPY %r2 + + %1(s32) = COPY %r3 + ; CHECK: [[IN2:%[0-9]+]] = COPY %r3 + + %2(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 1 + ; CHECK: %[[DREG]] = VMOVDRR [[IN1]], [[IN2]] + + %3(s32) = G_EXTRACT %2(s64), 0 + %4(s32) = G_EXTRACT %2(s64), 32 + ; CHECK: [[OUT1:%[0-9]+]] = VGETLNi32 %[[DREG]], 0 + ; CHECK: [[OUT2:%[0-9]+]] = VGETLNi32 %[[DREG]], 1 + + %r0 = COPY %3 + ; CHECK: %r0 = COPY [[OUT1]] + + %r1 = COPY %4 + ; CHECK: %r1 = COPY [[OUT2]] + + BX_RET 14, _, implicit %r0, implicit %r1 + ; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +... 
diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll index f863ed5a6849..a7f5ec33bee3 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple arm-unknown -global-isel -stop-after=irtranslator %s -o - | FileCheck %s +; RUN: llc -mtriple arm-unknown -mattr=+vfp2 -global-isel -stop-after=irtranslator %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LITTLE +; RUN: llc -mtriple armeb-unknown -mattr=+vfp2 -global-isel -stop-after=irtranslator %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=BIG define void @test_void_return() { ; CHECK-LABEL: name: test_void_return @@ -7,6 +8,20 @@ entry: ret void } +define signext i1 @test_add_i1(i1 %x, i1 %y) { +; CHECK-LABEL: name: test_add_i1 +; CHECK: liveins: %r0, %r1 +; CHECK-DAG: [[VREGX:%[0-9]+]](s1) = COPY %r0 +; CHECK-DAG: [[VREGY:%[0-9]+]](s1) = COPY %r1 +; CHECK: [[SUM:%[0-9]+]](s1) = G_ADD [[VREGX]], [[VREGY]] +; CHECK: [[EXT:%[0-9]+]](s32) = G_SEXT [[SUM]] +; CHECK: %r0 = COPY [[EXT]](s32) +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %sum = add i1 %x, %y + ret i1 %sum +} + define i8 @test_add_i8(i8 %x, i8 %y) { ; CHECK-LABEL: name: test_add_i8 ; CHECK: liveins: %r0, %r1 @@ -20,6 +35,17 @@ entry: ret i8 %sum } +define signext i8 @test_return_sext_i8(i8 %x) { +; CHECK-LABEL: name: test_return_sext_i8 +; CHECK: liveins: %r0 +; CHECK: [[VREG:%[0-9]+]](s8) = COPY %r0 +; CHECK: [[VREGEXT:%[0-9]+]](s32) = G_SEXT [[VREG]] +; CHECK: %r0 = COPY [[VREGEXT]](s32) +; CHECK: BX_RET 14, _, implicit %r0 +entry: + ret i8 %x +} + define i16 @test_add_i16(i16 %x, i16 %y) { ; CHECK-LABEL: name: test_add_i16 ; CHECK: liveins: %r0, %r1 @@ -33,6 +59,17 @@ entry: ret i16 %sum } +define zeroext i16 @test_return_zext_i16(i16 %x) { +; CHECK-LABEL: name: test_return_zext_i16 +; CHECK: liveins: %r0 +; CHECK: [[VREG:%[0-9]+]](s16) = COPY %r0 +; CHECK: [[VREGEXT:%[0-9]+]](s32) = G_ZEXT [[VREG]] +; CHECK: %r0 = COPY [[VREGEXT]](s32) +; CHECK: BX_RET 14, _, implicit %r0 +entry: + ret i16 %x +} + define i32 @test_add_i32(i32 %x, i32 %y) { ; CHECK-LABEL: name: test_add_i32 ; CHECK: liveins: %r0, %r1 @@ -46,8 +83,8 @@ entry: ret i32 %sum } -define i32 @test_many_args(i32 %p0, i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5) { -; CHECK-LABEL: name: test_many_args +define i32 @test_stack_args(i32 %p0, i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5) { +; CHECK-LABEL: name: test_stack_args ; CHECK: fixedStack: ; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 4 ; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 4 @@ -62,3 +99,527 @@ entry: %sum = add i32 %p2, %p5 ret i32 %sum } + +define i16 @test_stack_args_signext(i32 %p0, i16 %p1, i8 %p2, i1 %p3, + i8 signext %p4, i16 signext %p5) { +; CHECK-LABEL: name: test_stack_args_signext +; CHECK: fixedStack: +; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1 +; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK: [[VREGP1:%[0-9]+]]{{.*}} = COPY %r1 +; CHECK: [[FIP5:%[0-9]+]]{{.*}} = G_FRAME_INDEX %fixed-stack.[[P5]] +; CHECK: [[VREGP5:%[0-9]+]]{{.*}} = G_LOAD [[FIP5]](p0) +; CHECK: [[SUM:%[0-9]+]]{{.*}} = G_ADD [[VREGP1]], [[VREGP5]] +; CHECK: %r0 = COPY [[SUM]] +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %sum = add i16 %p1, %p5 + ret i16 %sum +} + +define i8 @test_stack_args_zeroext(i32 %p0, i16 %p1, i8 %p2, i1 %p3, + i8 zeroext %p4, i16 zeroext %p5) { +; CHECK-LABEL: name: test_stack_args_zeroext +; CHECK: 
fixedStack: +; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1 +; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK: [[VREGP2:%[0-9]+]]{{.*}} = COPY %r2 +; CHECK: [[FIP4:%[0-9]+]]{{.*}} = G_FRAME_INDEX %fixed-stack.[[P4]] +; CHECK: [[VREGP4:%[0-9]+]]{{.*}} = G_LOAD [[FIP4]](p0) +; CHECK: [[SUM:%[0-9]+]]{{.*}} = G_ADD [[VREGP2]], [[VREGP4]] +; CHECK: %r0 = COPY [[SUM]] +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %sum = add i8 %p2, %p4 + ret i8 %sum +} + +define i16 @test_ptr_arg(i16* %p) { +; CHECK-LABEL: name: test_ptr_arg +; CHECK: liveins: %r0 +; CHECK: [[VREGP:%[0-9]+]](p0) = COPY %r0 +; CHECK: [[VREGV:%[0-9]+]](s16) = G_LOAD [[VREGP]](p0) +entry: + %v = load i16, i16* %p + ret i16 %v +} + +define i32* @test_ptr_ret(i32** %p) { +; Test pointer returns and pointer-to-pointer arguments +; CHECK-LABEL: name: test_ptr_ret +; CHECK: liveins: %r0 +; CHECK: [[VREGP:%[0-9]+]](p0) = COPY %r0 +; CHECK: [[VREGV:%[0-9]+]](p0) = G_LOAD [[VREGP]](p0) +; CHECK: %r0 = COPY [[VREGV]] +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %v = load i32*, i32** %p + ret i32* %v +} + +define i32 @test_ptr_arg_on_stack(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32* %p) { +; CHECK-LABEL: name: test_ptr_arg_on_stack +; CHECK: fixedStack: +; CHECK: id: [[P:[0-9]+]]{{.*}}offset: 0{{.*}}size: 4 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK: [[FIP:%[0-9]+]]{{.*}} = G_FRAME_INDEX %fixed-stack.[[P]] +; CHECK: [[VREGP:%[0-9]+]](p0) = G_LOAD [[FIP]](p0) +; CHECK: [[VREGV:%[0-9]+]](s32) = G_LOAD [[VREGP]](p0) +; CHECK: %r0 = COPY [[VREGV]] +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %v = load i32, i32* %p + ret i32 %v +} + +define arm_aapcscc float @test_float_aapcscc(float %p0, float %p1, float %p2, + float %p3, float %p4, float %p5) { +; CHECK-LABEL: name: test_float_aapcscc +; CHECK: fixedStack: +; CHECK-DAG: id: [[P4:[0-9]+]]{{.*}}offset: 0{{.*}}size: 4 +; CHECK-DAG: id: [[P5:[0-9]+]]{{.*}}offset: 4{{.*}}size: 4 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK: [[VREGP1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[FIP5:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[P5]] +; CHECK: [[VREGP5:%[0-9]+]](s32) = G_LOAD [[FIP5]](p0) +; CHECK: [[VREGV:%[0-9]+]](s32) = G_FADD [[VREGP1]], [[VREGP5]] +; CHECK: %r0 = COPY [[VREGV]] +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %v = fadd float %p1, %p5 + ret float %v +} + +define arm_aapcs_vfpcc float @test_float_vfpcc(float %p0, float %p1, float %p2, + float %p3, float %p4, float %p5, + float %ridiculous, + float %number, + float %of, + float %parameters, + float %that, + float %should, + float %never, + float %exist, + float %in, + float %practice, + float %q0, float %q1) { +; CHECK-LABEL: name: test_float_vfpcc +; CHECK: fixedStack: +; CHECK-DAG: id: [[Q0:[0-9]+]]{{.*}}offset: 0{{.*}}size: 4 +; CHECK-DAG: id: [[Q1:[0-9]+]]{{.*}}offset: 4{{.*}}size: 4 +; CHECK: liveins: %s0, %s1, %s2, %s3, %s4, %s5, %s6, %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15 +; CHECK: [[VREGP1:%[0-9]+]](s32) = COPY %s1 +; CHECK: [[FIQ1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Q1]] +; CHECK: [[VREGQ1:%[0-9]+]](s32) = G_LOAD [[FIQ1]](p0) +; CHECK: [[VREGV:%[0-9]+]](s32) = G_FADD [[VREGP1]], [[VREGQ1]] +; CHECK: %s0 = COPY [[VREGV]] +; CHECK: BX_RET 14, _, implicit %s0 +entry: + %v = fadd float %p1, %q1 + ret float %v +} + +define arm_aapcs_vfpcc double @test_double_vfpcc(double %p0, double %p1, double %p2, + double %p3, double %p4, double %p5, + double %reasonable, + double %parameters, + double %q0, double %q1) { +; CHECK-LABEL: name: 
test_double_vfpcc +; CHECK: fixedStack: +; CHECK-DAG: id: [[Q0:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8 +; CHECK-DAG: id: [[Q1:[0-9]+]]{{.*}}offset: 8{{.*}}size: 8 +; CHECK: liveins: %d0, %d1, %d2, %d3, %d4, %d5, %d6, %d7 +; CHECK: [[VREGP1:%[0-9]+]](s64) = COPY %d1 +; CHECK: [[FIQ1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Q1]] +; CHECK: [[VREGQ1:%[0-9]+]](s64) = G_LOAD [[FIQ1]](p0) +; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP1]], [[VREGQ1]] +; CHECK: %d0 = COPY [[VREGV]] +; CHECK: BX_RET 14, _, implicit %d0 +entry: + %v = fadd double %p1, %q1 + ret double %v +} + +define arm_aapcscc double @test_double_aapcscc(double %p0, double %p1, double %p2, + double %p3, double %p4, double %p5) { +; CHECK-LABEL: name: test_double_aapcscc +; CHECK: fixedStack: +; CHECK-DAG: id: [[P2:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8 +; CHECK-DAG: id: [[P3:[0-9]+]]{{.*}}offset: 8{{.*}}size: 8 +; CHECK-DAG: id: [[P4:[0-9]+]]{{.*}}offset: 16{{.*}}size: 8 +; CHECK-DAG: id: [[P5:[0-9]+]]{{.*}}offset: 24{{.*}}size: 8 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK-DAG: [[VREGP1LO:%[0-9]+]](s32) = COPY %r2 +; CHECK-DAG: [[VREGP1HI:%[0-9]+]](s32) = COPY %r3 +; LITTLE: [[VREGP1:%[0-9]+]](s64) = G_SEQUENCE [[VREGP1LO]](s32), 0, [[VREGP1HI]](s32), 32 +; BIG: [[VREGP1:%[0-9]+]](s64) = G_SEQUENCE [[VREGP1HI]](s32), 0, [[VREGP1LO]](s32), 32 +; CHECK: [[FIP5:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[P5]] +; CHECK: [[VREGP5:%[0-9]+]](s64) = G_LOAD [[FIP5]](p0) +; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP1]], [[VREGP5]] +; LITTLE: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0 +; LITTLE: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32 +; BIG: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0 +; BIG: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32 +; CHECK-DAG: %r0 = COPY [[VREGVLO]] +; CHECK-DAG: %r1 = COPY [[VREGVHI]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +entry: + %v = fadd double %p1, %p5 + ret double %v +} + +define arm_aapcs_vfpcc double @test_double_gap_vfpcc(double %p0, float %filler, + double %p1, double %p2, + double %p3, double %p4, + double %reasonable, + double %parameters, + double %q0, double %q1) { +; CHECK-LABEL: name: test_double_gap_vfpcc +; CHECK: fixedStack: +; CHECK-DAG: id: [[Q0:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8 +; CHECK-DAG: id: [[Q1:[0-9]+]]{{.*}}offset: 8{{.*}}size: 8 +; CHECK: liveins: %d0, %d2, %d3, %d4, %d5, %d6, %d7, %s2 +; CHECK: [[VREGP1:%[0-9]+]](s64) = COPY %d2 +; CHECK: [[FIQ1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Q1]] +; CHECK: [[VREGQ1:%[0-9]+]](s64) = G_LOAD [[FIQ1]](p0) +; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP1]], [[VREGQ1]] +; CHECK: %d0 = COPY [[VREGV]] +; CHECK: BX_RET 14, _, implicit %d0 +entry: + %v = fadd double %p1, %q1 + ret double %v +} + +define arm_aapcscc double @test_double_gap_aapcscc(float %filler, double %p0, + double %p1) { +; CHECK-LABEL: name: test_double_gap_aapcscc +; CHECK: fixedStack: +; CHECK-DAG: id: [[P1:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8 +; CHECK: liveins: %r0, %r2, %r3 +; CHECK-DAG: [[VREGP0LO:%[0-9]+]](s32) = COPY %r2 +; CHECK-DAG: [[VREGP0HI:%[0-9]+]](s32) = COPY %r3 +; LITTLE: [[VREGP0:%[0-9]+]](s64) = G_SEQUENCE [[VREGP0LO]](s32), 0, [[VREGP0HI]](s32), 32 +; BIG: [[VREGP0:%[0-9]+]](s64) = G_SEQUENCE [[VREGP0HI]](s32), 0, [[VREGP0LO]](s32), 32 +; CHECK: [[FIP1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[P1]] +; CHECK: [[VREGP1:%[0-9]+]](s64) = G_LOAD [[FIP1]](p0) +; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP0]], [[VREGP1]] +; LITTLE: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 
0 +; LITTLE: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32 +; BIG: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0 +; BIG: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32 +; CHECK-DAG: %r0 = COPY [[VREGVLO]] +; CHECK-DAG: %r1 = COPY [[VREGVHI]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +entry: + %v = fadd double %p0, %p1 + ret double %v +} + +define arm_aapcscc double @test_double_gap2_aapcscc(double %p0, float %filler, + double %p1) { +; CHECK-LABEL: name: test_double_gap2_aapcscc +; CHECK: fixedStack: +; CHECK-DAG: id: [[P1:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8 +; CHECK: liveins: %r0, %r1, %r2 +; CHECK-DAG: [[VREGP0LO:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[VREGP0HI:%[0-9]+]](s32) = COPY %r1 +; LITTLE: [[VREGP0:%[0-9]+]](s64) = G_SEQUENCE [[VREGP0LO]](s32), 0, [[VREGP0HI]](s32), 32 +; BIG: [[VREGP0:%[0-9]+]](s64) = G_SEQUENCE [[VREGP0HI]](s32), 0, [[VREGP0LO]](s32), 32 +; CHECK: [[FIP1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[P1]] +; CHECK: [[VREGP1:%[0-9]+]](s64) = G_LOAD [[FIP1]](p0) +; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP0]], [[VREGP1]] +; LITTLE: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0 +; LITTLE: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32 +; BIG: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0 +; BIG: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32 +; CHECK-DAG: %r0 = COPY [[VREGVLO]] +; CHECK-DAG: %r1 = COPY [[VREGVHI]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +entry: + %v = fadd double %p0, %p1 + ret double %v +} + +define arm_aapcscc void @test_indirect_call(void() *%fptr) { +; CHECK-LABEL: name: test_indirect_call +; CHECK: [[FPTR:%[0-9]+]](p0) = COPY %r0 +; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: BLX [[FPTR]](p0), csr_aapcs, implicit-def %lr, implicit %sp +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +entry: + notail call arm_aapcscc void %fptr() + ret void +} + +declare arm_aapcscc void @call_target() + +define arm_aapcscc void @test_direct_call() { +; CHECK-LABEL: name: test_direct_call +; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: BLX @call_target, csr_aapcs, implicit-def %lr, implicit %sp +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +entry: + notail call arm_aapcscc void @call_target() + ret void +} + +declare arm_aapcscc i32* @simple_reg_params_target(i32, i32*) + +define arm_aapcscc i32* @test_call_simple_reg_params(i32 *%a, i32 %b) { +; CHECK-LABEL: name: test_call_simple_reg_params +; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0 +; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1 +; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK-DAG: %r0 = COPY [[BVREG]] +; CHECK-DAG: %r1 = COPY [[AVREG]] +; CHECK: BLX @simple_reg_params_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0 +; CHECK: [[RVREG:%[0-9]+]](p0) = COPY %r0 +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: %r0 = COPY [[RVREG]] +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %r = notail call arm_aapcscc i32 *@simple_reg_params_target(i32 %b, i32 *%a) + ret i32 *%r +} + +declare arm_aapcscc i32* @simple_stack_params_target(i32, i32*, i32, i32*, i32, i32*) + +define arm_aapcscc i32* @test_call_simple_stack_params(i32 *%a, i32 %b) { +; CHECK-LABEL: name: test_call_simple_stack_params +; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0 +; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1 +; CHECK: ADJCALLSTACKDOWN 8, 14, _, 
implicit-def %sp, implicit %sp +; CHECK-DAG: %r0 = COPY [[BVREG]] +; CHECK-DAG: %r1 = COPY [[AVREG]] +; CHECK-DAG: %r2 = COPY [[BVREG]] +; CHECK-DAG: %r3 = COPY [[AVREG]] +; CHECK: [[SP1:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF1:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[FI1:%[0-9]+]](p0) = G_GEP [[SP1]], [[OFF1]](s32) +; CHECK: G_STORE [[BVREG]](s32), [[FI1]](p0){{.*}}store 4 +; CHECK: [[SP2:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF2:%[0-9]+]](s32) = G_CONSTANT i32 4 +; CHECK: [[FI2:%[0-9]+]](p0) = G_GEP [[SP2]], [[OFF2]](s32) +; CHECK: G_STORE [[AVREG]](p0), [[FI2]](p0){{.*}}store 4 +; CHECK: BLX @simple_stack_params_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 +; CHECK: [[RVREG:%[0-9]+]](p0) = COPY %r0 +; CHECK: ADJCALLSTACKUP 8, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: %r0 = COPY [[RVREG]] +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %r = notail call arm_aapcscc i32 *@simple_stack_params_target(i32 %b, i32 *%a, i32 %b, i32 *%a, i32 %b, i32 *%a) + ret i32 *%r +} + +declare arm_aapcscc signext i16 @ext_target(i8 signext, i8 zeroext, i16 signext, i16 zeroext, i8 signext, i8 zeroext, i16 signext, i16 zeroext, i1 zeroext) + +define arm_aapcscc signext i16 @test_call_ext_params(i8 %a, i16 %b, i1 %c) { +; CHECK-LABEL: name: test_call_ext_params +; CHECK-DAG: [[AVREG:%[0-9]+]](s8) = COPY %r0 +; CHECK-DAG: [[BVREG:%[0-9]+]](s16) = COPY %r1 +; CHECK-DAG: [[CVREG:%[0-9]+]](s1) = COPY %r2 +; CHECK: ADJCALLSTACKDOWN 20, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[SEXTA:%[0-9]+]](s32) = G_SEXT [[AVREG]](s8) +; CHECK: %r0 = COPY [[SEXTA]] +; CHECK: [[ZEXTA:%[0-9]+]](s32) = G_ZEXT [[AVREG]](s8) +; CHECK: %r1 = COPY [[ZEXTA]] +; CHECK: [[SEXTB:%[0-9]+]](s32) = G_SEXT [[BVREG]](s16) +; CHECK: %r2 = COPY [[SEXTB]] +; CHECK: [[ZEXTB:%[0-9]+]](s32) = G_ZEXT [[BVREG]](s16) +; CHECK: %r3 = COPY [[ZEXTB]] +; CHECK: [[SP1:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF1:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[FI1:%[0-9]+]](p0) = G_GEP [[SP1]], [[OFF1]](s32) +; CHECK: [[SEXTA2:%[0-9]+]](s32) = G_SEXT [[AVREG]] +; CHECK: G_STORE [[SEXTA2]](s32), [[FI1]](p0){{.*}}store 4 +; CHECK: [[SP2:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF2:%[0-9]+]](s32) = G_CONSTANT i32 4 +; CHECK: [[FI2:%[0-9]+]](p0) = G_GEP [[SP2]], [[OFF2]](s32) +; CHECK: [[ZEXTA2:%[0-9]+]](s32) = G_ZEXT [[AVREG]] +; CHECK: G_STORE [[ZEXTA2]](s32), [[FI2]](p0){{.*}}store 4 +; CHECK: [[SP3:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF3:%[0-9]+]](s32) = G_CONSTANT i32 8 +; CHECK: [[FI3:%[0-9]+]](p0) = G_GEP [[SP3]], [[OFF3]](s32) +; CHECK: [[SEXTB2:%[0-9]+]](s32) = G_SEXT [[BVREG]] +; CHECK: G_STORE [[SEXTB2]](s32), [[FI3]](p0){{.*}}store 4 +; CHECK: [[SP4:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF4:%[0-9]+]](s32) = G_CONSTANT i32 12 +; CHECK: [[FI4:%[0-9]+]](p0) = G_GEP [[SP4]], [[OFF4]](s32) +; CHECK: [[ZEXTB2:%[0-9]+]](s32) = G_ZEXT [[BVREG]] +; CHECK: G_STORE [[ZEXTB2]](s32), [[FI4]](p0){{.*}}store 4 +; CHECK: [[SP5:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF5:%[0-9]+]](s32) = G_CONSTANT i32 16 +; CHECK: [[FI5:%[0-9]+]](p0) = G_GEP [[SP5]], [[OFF5]](s32) +; CHECK: [[ZEXTC:%[0-9]+]](s32) = G_ZEXT [[CVREG]] +; CHECK: G_STORE [[ZEXTC]](s32), [[FI5]](p0){{.*}}store 4 +; CHECK: BLX @ext_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 +; CHECK: [[RVREG:%[0-9]+]](s16) = COPY %r0 +; CHECK: ADJCALLSTACKUP 20, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[RExtVREG:%[0-9]+]](s32) = G_SEXT 
[[RVREG]] +; CHECK: %r0 = COPY [[RExtVREG]] +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %r = notail call arm_aapcscc signext i16 @ext_target(i8 signext %a, i8 zeroext %a, i16 signext %b, i16 zeroext %b, i8 signext %a, i8 zeroext %a, i16 signext %b, i16 zeroext %b, i1 zeroext %c) + ret i16 %r +} + +declare arm_aapcs_vfpcc double @vfpcc_fp_target(float, double) + +define arm_aapcs_vfpcc double @test_call_vfpcc_fp_params(double %a, float %b) { +; CHECK-LABEL: name: test_call_vfpcc_fp_params +; CHECK-DAG: [[AVREG:%[0-9]+]](s64) = COPY %d0 +; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %s2 +; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK-DAG: %s0 = COPY [[BVREG]] +; CHECK-DAG: %d1 = COPY [[AVREG]] +; CHECK: BLX @vfpcc_fp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %s0, implicit %d1, implicit-def %d0 +; CHECK: [[RVREG:%[0-9]+]](s64) = COPY %d0 +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: %d0 = COPY [[RVREG]] +; CHECK: BX_RET 14, _, implicit %d0 +entry: + %r = notail call arm_aapcs_vfpcc double @vfpcc_fp_target(float %b, double %a) + ret double %r +} + +declare arm_aapcscc double @aapcscc_fp_target(float, double, float, double) + +define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) { +; CHECK-LABEL: name: test_call_aapcs_fp_params +; CHECK-DAG: [[A1:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[A2:%[0-9]+]](s32) = COPY %r1 +; LITTLE-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A1]](s32), 0, [[A2]](s32), 32 +; BIG-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A2]](s32), 0, [[A1]](s32), 32 +; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r2 +; CHECK: ADJCALLSTACKDOWN 16, 14, _, implicit-def %sp, implicit %sp +; CHECK-DAG: %r0 = COPY [[BVREG]] +; CHECK-DAG: [[A1:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 0 +; CHECK-DAG: [[A2:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 32 +; LITTLE-DAG: %r2 = COPY [[A1]] +; LITTLE-DAG: %r3 = COPY [[A2]] +; BIG-DAG: %r2 = COPY [[A2]] +; BIG-DAG: %r3 = COPY [[A1]] +; CHECK: [[SP1:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF1:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[FI1:%[0-9]+]](p0) = G_GEP [[SP1]], [[OFF1]](s32) +; CHECK: G_STORE [[BVREG]](s32), [[FI1]](p0){{.*}}store 4 +; CHECK: [[SP2:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF2:%[0-9]+]](s32) = G_CONSTANT i32 8 +; CHECK: [[FI2:%[0-9]+]](p0) = G_GEP [[SP2]], [[OFF2]](s32) +; CHECK: G_STORE [[AVREG]](s64), [[FI2]](p0){{.*}}store 8 +; CHECK: BLX @aapcscc_fp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 +; CHECK-DAG: [[R1:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[R2:%[0-9]+]](s32) = COPY %r1 +; LITTLE: [[RVREG:%[0-9]+]](s64) = G_SEQUENCE [[R1]](s32), 0, [[R2]](s32), 32 +; BIG: [[RVREG:%[0-9]+]](s64) = G_SEQUENCE [[R2]](s32), 0, [[R1]](s32), 32 +; CHECK: ADJCALLSTACKUP 16, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RVREG]](s64), 0 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[RVREG]](s64), 32 +; LITTLE-DAG: %r0 = COPY [[R1]] +; LITTLE-DAG: %r1 = COPY [[R2]] +; BIG-DAG: %r0 = COPY [[R2]] +; BIG-DAG: %r1 = COPY [[R1]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +entry: + %r = notail call arm_aapcscc double @aapcscc_fp_target(float %b, double %a, float %b, double %a) + ret double %r +} + +declare arm_aapcscc float @different_call_conv_target(float) + +define arm_aapcs_vfpcc float @test_call_different_call_conv(float %x) { +; CHECK-LABEL: name: test_call_different_call_conv +; CHECK: [[X:%[0-9]+]](s32) = COPY %s0 +; 
CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: %r0 = COPY [[X]] +; CHECK: BLX @different_call_conv_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit-def %r0 +; CHECK: [[R:%[0-9]+]](s32) = COPY %r0 +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: %s0 = COPY [[R]] +; CHECK: BX_RET 14, _, implicit %s0 +entry: + %r = notail call arm_aapcscc float @different_call_conv_target(float %x) + ret float %r +} + +define i32 @test_shufflevector_s32_v2s32(i32 %arg) { +; CHECK-LABEL: name: test_shufflevector_s32_v2s32 +; CHECK: [[ARG:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[UNDEF:%[0-9]+]](s32) = IMPLICIT_DEF +; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32) +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], [[MASK]](<2 x s32>) +; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>) + %vec = insertelement <1 x i32> undef, i32 %arg, i32 0 + %shuffle = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer + %res = extractelement <2 x i32> %shuffle, i32 0 + ret i32 %res +} + +define i32 @test_shufflevector_v2s32_v3s32(i32 %arg1, i32 %arg2) { +; CHECK-LABEL: name: test_shufflevector_v2s32_v3s32 +; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[ARG2:%[0-9]+]](s32) = COPY %r1 +; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF +; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK-DAG: [[MASK:%[0-9]+]](<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32), [[C1]](s32) +; CHECK-DAG: [[V1:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32) +; CHECK-DAG: [[V2:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32) +; CHECK: [[VEC:%[0-9]+]](<3 x s32>) = G_SHUFFLE_VECTOR [[V2]](<2 x s32>), [[UNDEF]], [[MASK]](<3 x s32>) +; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>) + %v1 = insertelement <2 x i32> undef, i32 %arg1, i32 0 + %v2 = insertelement <2 x i32> %v1, i32 %arg2, i32 1 + %shuffle = shufflevector <2 x i32> %v2, <2 x i32> undef, <3 x i32> + %res = extractelement <3 x i32> %shuffle, i32 0 + ret i32 %res +} + + +define i32 @test_shufflevector_v2s32_v4s32(i32 %arg1, i32 %arg2) { +; CHECK-LABEL: name: test_shufflevector_v2s32_v4s32 +; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[ARG2:%[0-9]+]](s32) = COPY %r1 +; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF +; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK-DAG: [[MASK:%[0-9]+]](<4 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32), [[C0]](s32), [[C0]](s32) +; CHECK-DAG: [[V1:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32) +; CHECK-DAG: [[V2:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32) +; CHECK: [[VEC:%[0-9]+]](<4 x s32>) = G_SHUFFLE_VECTOR [[V2]](<2 x s32>), [[UNDEF]], [[MASK]](<4 x s32>) +; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<4 x s32>) + %v1 = insertelement <2 x i32> undef, i32 %arg1, i32 0 + %v2 = insertelement <2 x i32> %v1, i32 %arg2, i32 1 + %shuffle = shufflevector <2 x i32> %v2, <2 x i32> undef, <4 x i32> zeroinitializer + %res = extractelement <4 x i32> %shuffle, i32 0 + ret i32 %res +} + +define i32 @test_shufflevector_v4s32_v2s32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) { +; CHECK-LABEL: name: test_shufflevector_v4s32_v2s32 +; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %r0 +; CHECK: 
[[ARG2:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[ARG3:%[0-9]+]](s32) = COPY %r2 +; CHECK: [[ARG4:%[0-9]+]](s32) = COPY %r3 +; CHECK-DAG: [[UNDEF:%[0-9]+]](<4 x s32>) = IMPLICIT_DEF +; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK-DAG: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2 +; CHECK-DAG: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3 +; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C3]](s32) +; CHECK-DAG: [[V1:%[0-9]+]](<4 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32) +; CHECK-DAG: [[V2:%[0-9]+]](<4 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32) +; CHECK-DAG: [[V3:%[0-9]+]](<4 x s32>) = G_INSERT_VECTOR_ELT [[V2]], [[ARG3]](s32), [[C2]](s32) +; CHECK-DAG: [[V4:%[0-9]+]](<4 x s32>) = G_INSERT_VECTOR_ELT [[V3]], [[ARG4]](s32), [[C3]](s32) +; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[V4]](<4 x s32>), [[UNDEF]], [[MASK]](<2 x s32>) +; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>) + %v1 = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %v2 = insertelement <4 x i32> %v1, i32 %arg2, i32 1 + %v3 = insertelement <4 x i32> %v2, i32 %arg3, i32 2 + %v4 = insertelement <4 x i32> %v3, i32 %arg4, i32 3 + %shuffle = shufflevector <4 x i32> %v4, <4 x i32> undef, <2 x i32> + %res = extractelement <2 x i32> %shuffle, i32 0 + ret i32 %res +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll new file mode 100644 index 000000000000..7d021fdb43dd --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll @@ -0,0 +1,51 @@ +; RUN: llc -mtriple arm-linux-gnueabihf -mattr=+vfp2 -float-abi=hard -global-isel %s -o - | FileCheck %s -check-prefix CHECK -check-prefix HARD +; RUN: llc -mtriple arm-linux-gnueabi -mattr=+vfp2,+soft-float -float-abi=soft -global-isel %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT-AEABI +; RUN: llc -mtriple arm-linux-gnu- -mattr=+vfp2,+soft-float -float-abi=soft -global-isel %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT-DEFAULT + +define arm_aapcscc float @test_frem_float(float %x, float %y) { +; CHECK-LABEL: test_frem_float: +; CHECK: blx fmodf + %r = frem float %x, %y + ret float %r +} + +define arm_aapcscc double @test_frem_double(double %x, double %y) { +; CHECK-LABEL: test_frem_double: +; CHECK: blx fmod + %r = frem double %x, %y + ret double %r +} + +declare float @llvm.pow.f32(float %x, float %y) +define arm_aapcscc float @test_fpow_float(float %x, float %y) { +; CHECK-LABEL: test_fpow_float: +; CHECK: blx powf + %r = call float @llvm.pow.f32(float %x, float %y) + ret float %r +} + +declare double @llvm.pow.f64(double %x, double %y) +define arm_aapcscc double @test_fpow_double(double %x, double %y) { +; CHECK-LABEL: test_fpow_double: +; CHECK: blx pow + %r = call double @llvm.pow.f64(double %x, double %y) + ret double %r +} + +define arm_aapcscc float @test_add_float(float %x, float %y) { +; CHECK-LABEL: test_add_float: +; HARD: vadd.f32 +; SOFT-AEABI: blx __aeabi_fadd +; SOFT-DEFAULT: blx __addsf3 + %r = fadd float %x, %y + ret float %r +} + +define arm_aapcscc double @test_add_double(double %x, double %y) { +; CHECK-LABEL: test_add_double: +; HARD: vadd.f64 +; SOFT-AEABI: blx __aeabi_dadd +; SOFT-DEFAULT: blx __adddf3 + %r = fadd double %x, %y + ret double %r +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll index 3f01b6dd3a83..236dcbeb84c5 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll +++ 
b/test/CodeGen/ARM/GlobalISel/arm-isel.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple arm-unknown -global-isel %s -o - | FileCheck %s +; RUN: llc -mtriple arm-unknown -mattr=+vfp2 -global-isel %s -o - | FileCheck %s define void @test_void_return() { ; CHECK-LABEL: test_void_return: @@ -7,6 +7,39 @@ entry: ret void } +define zeroext i1 @test_zext_i1(i1 %x) { +; CHECK-LABEL: test_zext_i1 +; CHECK: and r0, r0, #1 +; CHECK: bx lr +entry: + ret i1 %x +} + +define signext i1 @test_sext_i1(i1 %x) { +; CHECK-LABEL: test_sext_i1 +; CHECK: and r0, r0, #1 +; CHECK: rsb r0, r0, #0 +; CHECK: bx lr +entry: + ret i1 %x +} + +define zeroext i8 @test_ext_i8(i8 %x) { +; CHECK-LABEL: test_ext_i8: +; CHECK: uxtb r0, r0 +; CHECK: bx lr +entry: + ret i8 %x +} + +define signext i16 @test_ext_i16(i16 %x) { +; CHECK-LABEL: test_ext_i16: +; CHECK: sxth r0, r0 +; CHECK: bx lr +entry: + ret i16 %x +} + define i8 @test_add_i8(i8 %x, i8 %y) { ; CHECK-LABEL: test_add_i8: ; CHECK: add r0, r0, r1 @@ -34,8 +67,8 @@ entry: ret i32 %sum } -define i32 @test_many_args(i32 %p0, i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5) { -; CHECK-LABEL: test_many_args: +define i32 @test_stack_args_i32(i32 %p0, i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5) { +; CHECK-LABEL: test_stack_args_i32: ; CHECK: add [[P5ADDR:r[0-9]+]], sp, #4 ; CHECK: ldr [[P5:r[0-9]+]], {{.*}}[[P5ADDR]] ; CHECK: add r0, r2, [[P5]] @@ -44,3 +77,108 @@ entry: %sum = add i32 %p2, %p5 ret i32 %sum } + +define i16 @test_stack_args_mixed(i32 %p0, i16 %p1, i8 %p2, i1 %p3, i8 %p4, i16 %p5) { +; CHECK-LABEL: test_stack_args_mixed: +; CHECK: add [[P5ADDR:r[0-9]+]], sp, #4 +; CHECK: ldrh [[P5:r[0-9]+]], {{.*}}[[P5ADDR]] +; CHECK: add r0, r1, [[P5]] +; CHECK: bx lr +entry: + %sum = add i16 %p1, %p5 + ret i16 %sum +} + +define i16 @test_stack_args_zeroext(i32 %p0, i16 %p1, i8 %p2, i1 %p3, i16 zeroext %p4) { +; CHECK-LABEL: test_stack_args_zeroext: +; CHECK: mov [[P4ADDR:r[0-9]+]], sp +; CHECK: ldr [[P4:r[0-9]+]], {{.*}}[[P4ADDR]] +; CHECK: add r0, r1, [[P4]] +; CHECK: bx lr +entry: + %sum = add i16 %p1, %p4 + ret i16 %sum +} + +define i8 @test_stack_args_signext(i32 %p0, i16 %p1, i8 %p2, i1 %p3, i8 signext %p4) { +; CHECK-LABEL: test_stack_args_signext: +; CHECK: mov [[P4ADDR:r[0-9]+]], sp +; CHECK: ldr [[P4:r[0-9]+]], {{.*}}[[P4ADDR]] +; CHECK: add r0, r2, [[P4]] +; CHECK: bx lr +entry: + %sum = add i8 %p2, %p4 + ret i8 %sum +} + +define i32 @test_ptr_arg_in_reg(i32* %p) { +; CHECK-LABEL: test_ptr_arg_in_reg: +; CHECK: ldr r0, [r0] +; CHECK: bx lr +entry: + %v = load i32, i32* %p + ret i32 %v +} + +define i32 @test_ptr_arg_on_stack(i32 %f0, i32 %f1, i32 %f2, i32 %f3, i32* %p) { +; CHECK-LABEL: test_ptr_arg_on_stack: +; CHECK: mov r0, sp +; CHECK: ldr r0, [r0] +; CHECK: ldr r0, [r0] +; CHECK: bx lr +entry: + %v = load i32, i32* %p + ret i32 %v +} + +define i8* @test_ptr_ret(i8** %p) { +; CHECK-LABEL: test_ptr_ret: +; CHECK: ldr r0, [r0] +; CHECK: bx lr +entry: + %v = load i8*, i8** %p + ret i8* %v +} + +define arm_aapcs_vfpcc float @test_float_hard(float %f0, float %f1) { +; CHECK-LABEL: test_float_hard: +; CHECK: vadd.f32 s0, s0, s1 +; CHECK: bx lr +entry: + %v = fadd float %f0, %f1 + ret float %v +} + +define arm_aapcscc float @test_float_softfp(float %f0, float %f1) { +; CHECK-LABEL: test_float_softfp: +; CHECK-DAG: vmov [[F0:s[0-9]+]], r0 +; CHECK-DAG: vmov [[F1:s[0-9]+]], r1 +; CHECK: vadd.f32 [[FV:s[0-9]+]], [[F0]], [[F1]] +; CHECK: vmov r0, [[FV]] +; CHECK: bx lr +entry: + %v = fadd float %f0, %f1 + ret float %v +} + +define arm_aapcs_vfpcc double @test_double_hard(double %f0, 
double %f1) { +; CHECK-LABEL: test_double_hard: +; CHECK: vadd.f64 d0, d0, d1 +; CHECK: bx lr +entry: + %v = fadd double %f0, %f1 + ret double %v +} + +define arm_aapcscc double @test_double_softfp(double %f0, double %f1) { +; CHECK-LABEL: test_double_softfp: +; CHECK-DAG: vmov [[F0:d[0-9]+]], r0, r1 +; CHECK-DAG: vmov [[F1:d[0-9]+]], r2, r3 +; CHECK: vadd.f64 [[FV:d[0-9]+]], [[F0]], [[F1]] +; CHECK: vmov.32 r0, [[FV]][0] +; CHECK: vmov.32 r1, [[FV]][1] +; CHECK: bx lr +entry: + %v = fadd double %f0, %f1 + ret double %v +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir new file mode 100644 index 000000000000..d154b4887c19 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir @@ -0,0 +1,282 @@ +# RUN: llc -mtriple arm-linux-gnueabihf -mattr=+vfp2 -float-abi=hard -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefix CHECK -check-prefix HARD +# RUN: llc -mtriple arm-linux-gnueabi -mattr=+vfp2,+soft-float -float-abi=soft -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT -check-prefix SOFT-AEABI +# RUN: llc -mtriple arm-linux-gnu -mattr=+soft-float -float-abi=soft -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT -check-prefix SOFT-DEFAULT +--- | + define void @test_frem_float() { ret void } + define void @test_frem_double() { ret void } + + define void @test_fpow_float() { ret void } + define void @test_fpow_double() { ret void } + + define void @test_fadd_float() { ret void } + define void @test_fadd_double() { ret void } +... +--- +name: test_frem_float +# CHECK-LABEL: name: test_frem_float +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; HARD-DAG: %s0 = COPY [[X]] + ; HARD-DAG: %s1 = COPY [[Y]] + ; SOFT: BLX $fmodf, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; HARD: BLX $fmodf, {{.*}}, implicit %s0, implicit %s1, implicit-def %s0 + ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0 + ; HARD: [[R:%[0-9]+]](s32) = COPY %s0 + ; CHECK: ADJCALLSTACKUP + %2(s32) = G_FREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 +... +--- +name: test_frem_double +# CHECK-LABEL: name: test_frem_double +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } + - { id: 8, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + ; The inputs may be in the wrong order (depending on the target's + ; endianness), but that's orthogonal to what we're trying to test here. + ; For soft float, we only need to check that the first value, received + ; through R0-R1, ends up in R0-R1 or R1-R0, and the second value, received + ; through R2-R3, ends up in R2-R3 or R3-R2, when passed to fmod. + ; For hard float, the values need to end up in D0 and D1. 
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_SEQUENCE [[X0]] + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_SEQUENCE [[Y0]] + %4(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32 + %5(s64) = G_SEQUENCE %2(s32), 0, %3(s32), 32 + ; CHECK: ADJCALLSTACKDOWN + ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]] + ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]] + ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y0]] + ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y1]] + ; HARD-DAG: %d0 = COPY [[X]] + ; HARD-DAG: %d1 = COPY [[Y]] + ; SOFT: BLX $fmod, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 + ; HARD: BLX $fmod, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0 + ; CHECK: ADJCALLSTACKUP + %6(s64) = G_FREM %4, %5 + %7(s32) = G_EXTRACT %6(s64), 0 + %8(s32) = G_EXTRACT %6(s64), 32 + %r0 = COPY %7(s32) + %r1 = COPY %8(s32) + BX_RET 14, _, implicit %r0, implicit %r1 +... +--- +name: test_fpow_float +# CHECK-LABEL: name: test_fpow_float +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; HARD-DAG: %s0 = COPY [[X]] + ; HARD-DAG: %s1 = COPY [[Y]] + ; SOFT: BLX $powf, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; HARD: BLX $powf, {{.*}}, implicit %s0, implicit %s1, implicit-def %s0 + ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0 + ; HARD: [[R:%[0-9]+]](s32) = COPY %s0 + ; CHECK: ADJCALLSTACKUP + %2(s32) = G_FPOW %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 +... +--- +name: test_fpow_double +# CHECK-LABEL: name: test_fpow_double +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } + - { id: 8, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + ; The inputs may be in the wrong order (depending on the target's + ; endianness), but that's orthogonal to what we're trying to test here. + ; For soft float, we only need to check that the first value, received + ; through R0-R1, ends up in R0-R1 or R1-R0, and the second value, received + ; through R2-R3, ends up in R2-R3 or R3-R2, when passed to pow. + ; For hard float, the values need to end up in D0 and D1. 
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_SEQUENCE [[X0]] + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_SEQUENCE [[Y0]] + %4(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32 + %5(s64) = G_SEQUENCE %2(s32), 0, %3(s32), 32 + ; CHECK: ADJCALLSTACKDOWN + ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]] + ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]] + ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y0]] + ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y1]] + ; HARD-DAG: %d0 = COPY [[X]] + ; HARD-DAG: %d1 = COPY [[Y]] + ; SOFT: BLX $pow, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 + ; HARD: BLX $pow, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0 + ; CHECK: ADJCALLSTACKUP + %6(s64) = G_FPOW %4, %5 + %7(s32) = G_EXTRACT %6(s64), 0 + %8(s32) = G_EXTRACT %6(s64), 32 + %r0 = COPY %7(s32) + %r1 = COPY %8(s32) + BX_RET 14, _, implicit %r0, implicit %r1 +... +--- +name: test_fadd_float +# CHECK-LABEL: name: test_fadd_float +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; HARD: [[R:%[0-9]+]](s32) = G_FADD [[X]], [[Y]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fadd, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__addsf3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + %2(s32) = G_FADD %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fadd_double +# CHECK-LABEL: name: test_fadd_double +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } + - { id: 8, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_SEQUENCE [[X0]] + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_SEQUENCE [[Y0]] + %4(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32 + %5(s64) = G_SEQUENCE %2(s32), 0, %3(s32), 32 + ; HARD: [[R:%[0-9]+]](s64) = G_FADD [[X]], [[Y]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]] + ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]] + ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y0]] + ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dadd, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 + ; SOFT-DEFAULT: BLX $__adddf3, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 + ; SOFT: ADJCALLSTACKUP + %6(s64) = G_FADD %4, %5 + ; HARD-DAG: G_EXTRACT [[R]](s64), 0 + ; HARD-DAG: G_EXTRACT [[R]](s64), 32 + %7(s32) = G_EXTRACT %6(s64), 0 + %8(s32) = G_EXTRACT %6(s64), 32 + %r0 = COPY %7(s32) + %r1 = COPY %8(s32) + BX_RET 14, _, implicit %r0, implicit %r1 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index 98d71c09e63b..cbff7e12fb77 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -1,10 +1,68 @@ # RUN: llc -mtriple arm-- -global-isel -run-pass=legalizer %s -o - | FileCheck %s --- | + define void @test_sext_s8() { ret void } + define void @test_zext_s16() { ret void } + define void @test_add_s8() { ret void } define void @test_add_s16() { ret void } define void @test_add_s32() { ret void } define void @test_load_from_stack() { ret void } + define void @test_legal_loads() #0 { ret void } + define void @test_legal_stores() #0 { ret void } + + define void @test_gep() { ret void } + + define void @test_constants() { ret void } + + define void @test_fadd_s32() #0 { ret void } + define void @test_fadd_s64() #0 { ret void } + + attributes #0 = { "target-features"="+vfp2" } +... +--- +name: test_sext_s8 +# CHECK-LABEL: name: test_sext_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s8) = COPY %r0 + %1(s32) = G_SEXT %0 + ; G_SEXT with s8 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s32) = G_SEXT {{%[0-9]+}} + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_zext_s16 +# CHECK-LABEL: name: test_zext_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s16) = COPY %r0 + %1(s32) = G_ZEXT %0 + ; G_ZEXT with s16 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s32) = G_ZEXT {{%[0-9]+}} + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 ... --- name: test_add_s8 @@ -104,8 +162,179 @@ body: | ; This is legal, so we should find it unchanged in the output ; CHECK: [[FIVREG:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[FRAME_INDEX]] - ; CHECK: {{%[0-9]+}}(s32) = G_LOAD [[FIVREG]](p0) + ; CHECK: {{%[0-9]+}}(s32) = G_LOAD [[FIVREG]](p0) :: (load 4) %0(p0) = G_FRAME_INDEX %fixed-stack.2 - %1(s32) = G_LOAD %0(p0) + %1(s32) = G_LOAD %0(p0) :: (load 4) + BX_RET 14, _ +... +--- +name: test_legal_loads +# CHECK-LABEL: name: test_legal_loads +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + ; These are all legal, so we should find them unchanged in the output + ; CHECK-DAG: {{%[0-9]+}}(s64) = G_LOAD %0 + ; CHECK-DAG: {{%[0-9]+}}(s32) = G_LOAD %0 + ; CHECK-DAG: {{%[0-9]+}}(s16) = G_LOAD %0 + ; CHECK-DAG: {{%[0-9]+}}(s8) = G_LOAD %0 + ; CHECK-DAG: {{%[0-9]+}}(s1) = G_LOAD %0 + ; CHECK-DAG: {{%[0-9]+}}(p0) = G_LOAD %0 + %0(p0) = COPY %r0 + %1(s32) = G_LOAD %0(p0) :: (load 4) + %2(s16) = G_LOAD %0(p0) :: (load 2) + %3(s8) = G_LOAD %0(p0) :: (load 1) + %4(s1) = G_LOAD %0(p0) :: (load 1) + %5(p0) = G_LOAD %0(p0) :: (load 4) + %6(s64) = G_LOAD %0(p0) :: (load 8) + BX_RET 14, _ +... +--- +name: test_legal_stores +# CHECK-LABEL: name: test_legal_stores +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3, %r4, %r5, %r6, %d1 + + ; These are all legal, so we should find them unchanged in the output + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s64), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s32), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s16), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s8), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(s1), %0(p0) + ; CHECK-DAG: G_STORE {{%[0-9]+}}(p0), %0(p0) + %0(p0) = COPY %r0 + %1(s64) = COPY %d1 + G_STORE %1(s64), %0(p0) :: (store 8) + %2(s32) = COPY %r2 + G_STORE %2(s32), %0(p0) :: (store 4) + %3(s16) = COPY %r3 + G_STORE %3(s16), %0(p0) :: (store 2) + %4(s8) = COPY %r4 + G_STORE %4(s8), %0(p0) :: (store 1) + %5(s1) = COPY %r5 + G_STORE %5(s1), %0(p0) :: (store 1) + %6(p0) = COPY %r6 + G_STORE %6(p0), %0(p0) :: (store 4) BX_RET 14, _ ... 
+--- +name: test_gep +# CHECK-LABEL: name: test_gep +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(p0) = COPY %r0 + %1(s32) = COPY %r1 + + ; CHECK: {{%[0-9]+}}(p0) = G_GEP {{%[0-9]+}}, {{%[0-9]+}}(s32) + %2(p0) = G_GEP %0, %1(s32) + + %r0 = COPY %2(p0) + BX_RET 14, _, implicit %r0 +... +--- +name: test_constants +# CHECK-LABEL: name: test_constants +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } +body: | + bb.0: + %0(s32) = G_CONSTANT 42 + ; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT 42 + + %r0 = COPY %0(s32) + BX_RET 14, _, implicit %r0 +... +--- +name: test_fadd_s32 +# CHECK-LABEL: name: test_fadd_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_FADD %0, %1 + ; G_FADD with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s32) = G_FADD {{%[0-9]+, %[0-9]+}} + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_fadd_s64 +# CHECK-LABEL: name: test_fadd_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + %2(s64) = G_FADD %0, %1 + ; G_FADD with s64 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s64) = G_FADD {{%[0-9]+, %[0-9]+}} + %d0 = COPY %2(s64) + BX_RET 14, _, implicit %d0 + +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index ce0601021e62..fbf8d81322f8 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -3,6 +3,23 @@ define void @test_add_s32() { ret void } define void @test_add_s16() { ret void } define void @test_add_s8() { ret void } + define void @test_add_s1() { ret void } + + define void @test_loads() #0 { ret void } + define void @test_stores() #0 { ret void } + + define void @test_stack() { ret void } + + define void @test_gep() { ret void } + + define void @test_constants() { ret void } + + define void @test_fadd_s32() #0 { ret void } + define void @test_fadd_s64() #0 { ret void } + + define void @test_soft_fp_s64() #0 { ret void } + + attributes #0 = { "target-features"="+vfp2"} ... --- name: test_add_s32 @@ -82,3 +99,266 @@ body: | BX_RET 14, _, implicit %r0 ... +--- +name: test_add_s1 +# CHECK-LABEL: name: test_add_s1 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s1) = COPY %r0 + %1(s1) = COPY %r1 + %2(s1) = G_ADD %0, %1 + %r0 = COPY %2(s1) + BX_RET 14, _, implicit %r0 + +... 
+--- +name: test_loads +# CHECK-LABEL: name: test_loads +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } +# CHECK: - { id: 6, class: fprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } +body: | + bb.0: + liveins: %r0 + %0(p0) = COPY %r0 + %6(s64) = G_LOAD %0 :: (load 8) + %1(s32) = G_LOAD %0 :: (load 4) + %2(s16) = G_LOAD %0 :: (load 2) + %3(s8) = G_LOAD %0 :: (load 1) + %4(s1) = G_LOAD %0 :: (load 1) + %5(p0) = G_LOAD %0 :: (load 4) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_stores +# CHECK-LABEL: name: test_stores +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } +# CHECK: - { id: 6, class: fprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3, %r4, %r5, %d6 + %0(p0) = COPY %r0 + %1(s32) = COPY %r1 + G_STORE %1(s32), %0 :: (store 4) + %2(s16) = COPY %r2 + G_STORE %2(s16), %0 :: (store 2) + %3(s8) = COPY %r3 + G_STORE %3(s8), %0 :: (store 1) + %4(s1) = COPY %r4 + G_STORE %4(s1), %0 :: (store 1) + %5(p0) = COPY %r5 + G_STORE %5(p0), %0 :: (store 4) + %6(s64) = COPY %d6 + G_STORE %6(s64), %0 :: (store 8) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_stack +# CHECK-LABEL: name: test_stack +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 4, isImmutable: true, isAliased: false } +body: | + bb.0: + %0(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(s32) = G_LOAD %0(p0) :: (load 4 from %fixed-stack.0, align 0) + + %2(p0) = COPY %sp + %3(s32) = G_CONSTANT i32 8 + %4(p0) = G_GEP %2, %3(s32) + G_STORE %1(s32), %4(p0) :: (store 4) + + BX_RET 14, _ + +... +--- +name: test_gep +# CHECK-LABEL: name: test_gep +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(p0) = COPY %r0 + %1(s32) = COPY %r1 + %2(p0) = G_GEP %0, %1(s32) + %r0 = COPY %2(p0) + BX_RET 14, _, implicit %r0 +... +--- +name: test_constants +# CHECK-LABEL: name: test_constants +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +registers: + - { id: 0, class: _ } +body: | + bb.0: + %0(s32) = G_CONSTANT 42 + %r0 = COPY %0(s32) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fadd_s32 +# CHECK-LABEL: name: test_fadd_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: fprb } +# CHECK: - { id: 1, class: fprb } +# CHECK: - { id: 2, class: fprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + %1(s32) = COPY %s1 + %2(s32) = G_FADD %0, %1 + %s0 = COPY %2(s32) + BX_RET 14, _, implicit %s0 + +... +--- +name: test_fadd_s64 +# CHECK-LABEL: name: test_fadd_s64 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: fprb } +# CHECK: - { id: 1, class: fprb } +# CHECK: - { id: 2, class: fprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + %2(s64) = G_FADD %0, %1 + %d0 = COPY %2(s64) + BX_RET 14, _, implicit %d0 + +... +--- +name: test_soft_fp_s64 +# CHECK-LABEL: name: test_soft_fp_s64 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: fprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32 + %3(s32) = G_EXTRACT %2(s64), 0 + %4(s32) = G_EXTRACT %2(s64), 32 + %r0 = COPY %3(s32) + %r1 = COPY %4(s32) + BX_RET 14, _, implicit %r0, implicit %r1 + +... diff --git a/test/CodeGen/ARM/alloc-no-stack-realign.ll b/test/CodeGen/ARM/alloc-no-stack-realign.ll index 7d37c83d7483..0e077b3aee5a 100644 --- a/test/CodeGen/ARM/alloc-no-stack-realign.ll +++ b/test/CodeGen/ARM/alloc-no-stack-realign.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=NO-REALIGN -; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=REALIGN +; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s ; rdar://12713765 ; When realign-stack is set to false, make sure we are not creating stack @@ -8,29 +7,31 @@ define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" { entry: -; NO-REALIGN-LABEL: test1 -; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]] -; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! -; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 -; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48 -; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] - -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48 -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: mov r[[R3:[0-9]+]], r[[R1]] -; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]! -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128] - -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48 -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32 -; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]! 
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128] +; CHECK-LABEL: test1 +; CHECK: ldr r[[R1:[0-9]+]], [pc, r1] +; CHECK: add r[[R2:[0-9]+]], r1, #48 +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: mov r[[R2:[0-9]+]], r[[R1]] +; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32 +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: mov r[[R1:[0-9]+]], sp +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32 +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R1:[0-9]+]], r0, #48 +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: add r[[R1:[0-9]+]], r0, #32 +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]! +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128] %retval = alloca <16 x float>, align 16 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16 store <16 x float> %0, <16 x float>* %retval @@ -41,32 +42,33 @@ entry: define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp { entry: -; REALIGN-LABEL: test2 -; REALIGN: bfc sp, #0, #6 -; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]] -; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! -; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32 -; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48 -; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: ldr r[[R1:[0-9]+]], [pc, r1] +; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48 +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: mov r[[R2:[0-9]+]], r[[R1]] +; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32 +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: mov r[[R1:[0-9]+]], sp +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #32 +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]! +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R1:[0-9]+]], r0, #48 +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: add r[[R1:[0-9]+]], r0, #32 +; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]! 
+; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128] -; REALIGN: orr r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48 -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #32 -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #16 -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] - -; REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48 -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32 -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]! -; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128] - %retval = alloca <16 x float>, align 16 +%retval = alloca <16 x float>, align 16 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16 store <16 x float> %0, <16 x float>* %retval %1 = load <16 x float>, <16 x float>* %retval diff --git a/test/CodeGen/ARM/arg-copy-elide.ll b/test/CodeGen/ARM/arg-copy-elide.ll new file mode 100644 index 000000000000..739b560b0833 --- /dev/null +++ b/test/CodeGen/ARM/arg-copy-elide.ll @@ -0,0 +1,61 @@ +; RUN: llc -mtriple=armv7-linux < %s | FileCheck %s + +declare arm_aapcscc void @addrof_i32(i32*) +declare arm_aapcscc void @addrof_i64(i64*) + +define arm_aapcscc void @simple(i32, i32, i32, i32, i32 %x) { +entry: + %x.addr = alloca i32 + store i32 %x, i32* %x.addr + call void @addrof_i32(i32* %x.addr) + ret void +} + +; CHECK-LABEL: simple: +; CHECK: push {r11, lr} +; CHECK: add r0, sp, #8 +; CHECK: bl addrof_i32 +; CHECK: pop {r11, pc} + + +; We need to load %x before calling addrof_i32 now because it could mutate %x in +; place. + +define arm_aapcscc i32 @use_arg(i32, i32, i32, i32, i32 %x) { +entry: + %x.addr = alloca i32 + store i32 %x, i32* %x.addr + call void @addrof_i32(i32* %x.addr) + ret i32 %x +} + +; CHECK-LABEL: use_arg: +; CHECK: push {[[csr:[^ ]*]], lr} +; CHECK: ldr [[csr]], [sp, #8] +; CHECK: add r0, sp, #8 +; CHECK: bl addrof_i32 +; CHECK: mov r0, [[csr]] +; CHECK: pop {[[csr]], pc} + + +define arm_aapcscc i64 @split_i64(i32, i32, i32, i32, i64 %x) { +entry: + %x.addr = alloca i64, align 4 + store i64 %x, i64* %x.addr, align 4 + call void @addrof_i64(i64* %x.addr) + ret i64 %x +} + +; CHECK-LABEL: split_i64: +; CHECK: push {r4, r5, r11, lr} +; CHECK: sub sp, sp, #8 +; CHECK: ldr r4, [sp, #28] +; CHECK: ldr r5, [sp, #24] +; CHECK: mov r0, sp +; CHECK: str r4, [sp, #4] +; CHECK: str r5, [sp] +; CHECK: bl addrof_i64 +; CHECK: mov r0, r5 +; CHECK: mov r1, r4 +; CHECK: add sp, sp, #8 +; CHECK: pop {r4, r5, r11, pc} diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll index 9bd2077e4d03..31691e9468c9 100644 --- a/test/CodeGen/ARM/arm-and-tst-peephole.ll +++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll @@ -1,7 +1,6 @@ ; RUN: llc -mtriple=arm-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=ARM %s ; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=THUMB %s -; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \ -; RUN: | FileCheck -check-prefix=T2 %s +; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck -check-prefix=T2 %s ; RUN: llc -mtriple=thumbv8-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=V8 %s ; FIXME: The -march=thumb test doesn't change if -disable-peephole is specified. 
@@ -49,9 +48,9 @@ tailrecurse.switch: ; preds = %tailrecurse ; V8-NEXT: beq ; V8-NEXT: %tailrecurse.switch ; V8: cmp -; V8-NEXT: bne -; V8-NEXT: b -; The trailing space in the last line checks that the branch is unconditional +; V8-NEXT: beq +; V8-NEXT: %sw.epilog +; V8-NEXT: bx lr switch i32 %and, label %sw.epilog [ i32 1, label %sw.bb i32 3, label %sw.bb6 @@ -93,7 +92,7 @@ entry: %1 = load i8, i8* %0, align 1 %2 = zext i8 %1 to i32 ; ARM: ands -; THUMB: ands +; THUMB: ands ; T2: ands ; V8: ands ; V8-NEXT: beq @@ -141,19 +140,48 @@ return: ; preds = %bb2, %bb, %entry ; folding of unrelated tests (in this case, a TST against r1 was eliminated in ; favour of an AND of r0). +define i32 @test_tst_assessment(i32 %a, i32 %b) { ; ARM-LABEL: test_tst_assessment: +; ARM: @ BB#0: +; ARM-NEXT: and r0, r0, #1 +; ARM-NEXT: tst r1, #1 +; ARM-NEXT: subne r0, r0, #1 +; ARM-NEXT: mov pc, lr +; ; THUMB-LABEL: test_tst_assessment: +; THUMB: @ BB#0: +; THUMB-NEXT: movs r2, r0 +; THUMB-NEXT: movs r0, #1 +; THUMB-NEXT: ands r0, r2 +; THUMB-NEXT: subs r2, r0, #1 +; THUMB-NEXT: lsls r1, r1, #31 +; THUMB-NEXT: beq .LBB2_2 +; THUMB-NEXT: @ BB#1: +; THUMB-NEXT: movs r0, r2 +; THUMB-NEXT: .LBB2_2: +; THUMB-NEXT: bx lr +; ; T2-LABEL: test_tst_assessment: +; T2: @ BB#0: +; T2-NEXT: lsls r1, r1, #31 +; T2-NEXT: and r0, r0, #1 +; T2-NEXT: it ne +; T2-NEXT: subne r0, #1 +; T2-NEXT: bx lr +; ; V8-LABEL: test_tst_assessment: -define i32 @test_tst_assessment(i1 %lhs, i1 %rhs) { - %lhs32 = zext i1 %lhs to i32 - %rhs32 = zext i1 %rhs to i32 - %diff = sub nsw i32 %lhs32, %rhs32 -; ARM: tst r1, #1 -; THUMB: lsls r1, r1, #31 -; T2: lsls r1, r1, #31 -; V8: lsls r1, r1, #31 - ret i32 %diff +; V8: @ BB#0: +; V8-NEXT: lsls r1, r1, #31 +; V8-NEXT: and r0, r0, #1 +; V8-NEXT: it ne +; V8-NEXT: subne r0, #1 +; V8-NEXT: bx lr + %and1 = and i32 %a, 1 + %sub = sub i32 %and1, 1 + %and2 = and i32 %b, 1 + %cmp = icmp eq i32 %and2, 0 + %sel = select i1 %cmp, i32 %and1, i32 %sub + ret i32 %sel } !1 = !{!"branch_weights", i32 1, i32 1, i32 3, i32 2 } diff --git a/test/CodeGen/ARM/arm-position-independence.ll b/test/CodeGen/ARM/arm-position-independence.ll index 02a63984ad6f..4aa817f7a481 100644 --- a/test/CodeGen/ARM/arm-position-independence.ll +++ b/test/CodeGen/ARM/arm-position-independence.ll @@ -13,6 +13,12 @@ ; RUN: llc -relocation-model=rwpi -mtriple=thumbv6m--none-eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1_RO_ABS --check-prefix=THUMB1_RW_SB ; RUN: llc -relocation-model=ropi-rwpi -mtriple=thumbv6m--none-eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1_RO_PC --check-prefix=THUMB1_RW_SB +; RUN: llc -relocation-model=rwpi -mtriple=armv7a--none-eabi -mattr=no-movt < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NO_MOVT_ARM_RO_ABS --check-prefix=NO_MOVT_ARM_RW_SB +; RUN: llc -relocation-model=ropi-rwpi -mtriple=armv7a--none-eabi -mattr=no-movt < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NO_MOVT_ARM_RO_PC --check-prefix=NO_MOVT_ARM_RW_SB + +; RUN: llc -relocation-model=rwpi -mtriple=thumbv7m--none-eabi -mattr=no-movt < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NO_MOVT_THUMB2_RO_ABS --check-prefix=NO_MOVT_THUMB2_RW_SB +; RUN: llc -relocation-model=ropi-rwpi -mtriple=thumbv7m--none-eabi -mattr=no-movt < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NO_MOVT_THUMB2_RO_PC --check-prefix=NO_MOVT_THUMB2_RW_SB + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" @a = external global i32, align 4 @@ -28,16 +34,24 @@ entry: ; ARM_RW_ABS: movt r[[REG]], 
:upper16:a ; ARM_RW_ABS: ldr r0, [r[[REG]]] -; ARM_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; ARM_RW_SB: movw r[[REG:[0-9]]], :lower16:a(sbrel) +; ARM_RW_SB: movt r[[REG]], :upper16:a(sbrel) ; ARM_RW_SB: ldr r0, [r9, r[[REG]]] +; NO_MOVT_ARM_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_ARM_RW_SB: ldr r0, [r9, r[[REG]]] + ; THUMB2_RW_ABS: movw r[[REG:[0-9]]], :lower16:a ; THUMB2_RW_ABS: movt r[[REG]], :upper16:a ; THUMB2_RW_ABS: ldr r0, [r[[REG]]] -; THUMB2_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; THUMB2_RW_SB: movw r[[REG:[0-9]]], :lower16:a(sbrel) +; THUMB2_RW_SB: movt r[[REG]], :upper16:a(sbrel) ; THUMB2_RW_SB: ldr.w r0, [r9, r[[REG]]] +; NO_MOVT_THUMB2_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_THUMB2_RW_SB: ldr.w r0, [r9, r[[REG]]] + ; THUMB1_RW_ABS: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] ; THUMB1_RW_ABS: ldr r0, [r[[REG]]] @@ -47,11 +61,11 @@ entry: ; CHECK: {{(bx lr|pop)}} -; ARM_RW_SB: [[LCPI]] -; ARM_RW_SB: .long a(sbrel) +; NO_MOVT_ARM_RW_SB: [[LCPI]] +; NO_MOVT_ARM_RW_SB: .long a(sbrel) -; THUMB2_RW_SB: [[LCPI]] -; THUMB2_RW_SB: .long a(sbrel) +; NO_MOVT_THUMB2_RW_SB: [[LCPI]] +; NO_MOVT_THUMB2_RW_SB: .long a(sbrel) ; THUMB1_RW_ABS: [[LCPI]] ; THUMB1_RW_ABS-NEXT: .long a @@ -70,16 +84,24 @@ entry: ; ARM_RW_ABS: movt r[[REG]], :upper16:a ; ARM_RW_ABS: str r0, [r[[REG:[0-9]]]] -; ARM_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] -; ARM_RW_SB: str r0, [r9, r[[REG]]] +; ARM_RW_SB: movw r[[REG:[0-9]]], :lower16:a +; ARM_RW_SB: movt r[[REG]], :upper16:a +; ARM_RW_SB: str r0, [r9, r[[REG:[0-9]]]] + +; NO_MOVT_ARM_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_ARM_RW_SB: str r0, [r9, r[[REG]]] ; THUMB2_RW_ABS: movw r[[REG:[0-9]]], :lower16:a ; THUMB2_RW_ABS: movt r[[REG]], :upper16:a ; THUMB2_RW_ABS: str r0, [r[[REG]]] -; THUMB2_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; THUMB2_RW_SB: movw r[[REG:[0-9]]], :lower16:a(sbrel) +; THUMB2_RW_SB: movt r[[REG]], :upper16:a(sbrel) ; THUMB2_RW_SB: str.w r0, [r9, r[[REG]]] +; NO_MOVT_THUMB2_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_THUMB2_RW_SB: str.w r0, [r9, r[[REG]]] + ; THUMB1_RW_ABS: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] ; THUMB1_RW_ABS: str r0, [r[[REG]]] @@ -89,11 +111,11 @@ entry: ; CHECK: {{(bx lr|pop)}} -; ARM_RW_SB: [[LCPI]] -; ARM_RW_SB: .long a(sbrel) +; NO_MOVT_ARM_RW_SB: [[LCPI]] +; NO_MOVT_ARM_RW_SB: .long a(sbrel) -; THUMB2_RW_SB: [[LCPI]] -; THUMB2_RW_SB: .long a(sbrel) +; NO_MOVT_THUMB2_RW_SB: [[LCPI]] +; NO_MOVT_THUMB2_RW_SB: .long a(sbrel) ; THUMB1_RW_ABS: [[LCPI]] ; THUMB1_RW_ABS-NEXT: .long a @@ -112,21 +134,37 @@ entry: ; ARM_RO_ABS: movt r[[reg]], :upper16:b ; ARM_RO_ABS: ldr r0, [r[[reg]]] +; NO_MOVT_ARM_RO_ABS: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_ARM_RO_ABS: ldr r0, [r[[REG]]] + ; ARM_RO_PC: movw r[[REG:[0-9]]], :lower16:(b-([[LPC:.LPC[0-9]+_[0-9]+]]+8)) ; ARM_RO_PC: movt r[[REG]], :upper16:(b-([[LPC]]+8)) ; ARM_RO_PC: [[LPC]]: ; ARM_RO_PC-NEXT: ldr r0, [pc, r[[REG]]] +; NO_MOVT_ARM_RO_PC: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_ARM_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]: +; NO_MOVT_ARM_RO_PC: ldr r0, [pc, r[[REG]]] + ; THUMB2_RO_ABS: movw r[[REG:[0-9]]], :lower16:b ; THUMB2_RO_ABS: movt r[[REG]], :upper16:b ; THUMB2_RO_ABS: ldr r0, [r[[REG]]] +; NO_MOVT_THUMB2_RO_ABS: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_THUMB2_RO_ABS: ldr r0, [r[[REG]]] + ; THUMB2_RO_PC: movw r[[REG:[0-9]]], 
:lower16:(b-([[LPC:.LPC[0-9]+_[0-9]+]]+4)) ; THUMB2_RO_PC: movt r[[REG]], :upper16:(b-([[LPC]]+4)) ; THUMB2_RO_PC: [[LPC]]: ; THUMB2_RO_PC-NEXT: add r[[REG]], pc ; THUMB2_RO_PC: ldr r0, [r[[REG]]] +; NO_MOVT_THUMB2_RO_PC: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_THUMB2_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]: +; NO_MOVT_THUMB2_RO_PC-NEXT: add r[[REG]], pc +; NO_MOVT_THUMB2_RO_PC: ldr r0, [r[[REG]]] + + ; THUMB1_RO_ABS: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] ; THUMB1_RO_ABS: ldr r0, [r[[REG]]] @@ -137,9 +175,21 @@ entry: ; CHECK: {{(bx lr|pop)}} +; NO_MOVT_ARM_RO_ABS: [[LCPI]] +; NO_MOVT_ARM_RO_ABS-NEXT: .long b + +; NO_MOVT_THUMB2_RO_ABS: [[LCPI]] +; NO_MOVT_THUMB2_RO_ABS-NEXT: .long b + ; THUMB1_RO_ABS: [[LCPI]] ; THUMB1_RO_ABS-NEXT: .long b +; NO_MOVT_ARM_RO_PC: [[LCPI]] +; NO_MOVT_ARM_RO_PC-NEXT: .long b-([[LPC]]+8) + +; NO_MOVT_THUMB2_RO_PC: [[LCPI]] +; NO_MOVT_THUMB2_RO_PC-NEXT: .long b-([[LPC]]+4) + ; THUMB1_RO_PC: [[LCPI]] ; THUMB1_RO_PC-NEXT: .long b-([[LPC]]+4) } @@ -152,15 +202,23 @@ entry: ; ARM_RW_ABS: movw r[[REG:[0-9]]], :lower16:a ; ARM_RW_ABS: movt r[[REG]], :upper16:a -; ARM_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; ARM_RW_SB: movw r[[REG:[0-9]]], :lower16:a(sbrel) +; ARM_RW_SB: movt r[[REG]], :upper16:a(sbrel) ; ARM_RW_SB: add r0, r9, r[[REG]] +; NO_MOVT_ARM_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_ARM_RW_SB: add r0, r9, r[[REG]] + ; THUMB2_RW_ABS: movw r[[REG:[0-9]]], :lower16:a ; THUMB2_RW_ABS: movt r[[REG]], :upper16:a -; THUMB2_RW_SB: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] +; THUMB2_RW_SB: movw r[[REG:[0-9]]], :lower16:a(sbrel) +; THUMB2_RW_SB: movt r[[REG]], :upper16:a(sbrel) ; THUMB2_RW_SB: add r0, r9 +; NO_MOVT_THUMB2_RW_SB: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_THUMB2_RW_SB: add r0, r9 + ; THUMB1_RW_ABS: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] ; THUMB1_RW_SB: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] @@ -169,11 +227,11 @@ entry: ; CHECK: {{(bx lr|pop)}} -; ARM_RW_SB: [[LCPI]] -; ARM_RW_SB: .long a(sbrel) +; NO_MOVT_ARM_RW_SB: [[LCPI]] +; NO_MOVT_ARM_RW_SB: .long a(sbrel) -; THUMB2_RW_SB: [[LCPI]] -; THUMB2_RW_SB: .long a(sbrel) +; NO_MOVT_THUMB2_RW_SB: [[LCPI]] +; NO_MOVT_THUMB2_RW_SB: .long a(sbrel) ; THUMB1_RW_ABS: [[LCPI]] ; THUMB1_RW_ABS-NEXT: .long a @@ -190,19 +248,31 @@ entry: ; ARM_RO_ABS: movw r[[REG:[0-9]]], :lower16:b ; ARM_RO_ABS: movt r[[REG]], :upper16:b +; NO_MOVT_ARM_RO_ABS: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] + ; ARM_RO_PC: movw r[[REG:[0-9]]], :lower16:(b-([[LPC:.LPC[0-9]+_[0-9]+]]+8)) ; ARM_RO_PC: movt r[[REG]], :upper16:(b-([[LPC]]+8)) ; ARM_RO_PC: [[LPC]]: ; ARM_RO_PC-NEXT: add r0, pc, r[[REG:[0-9]]] +; NO_MOVT_ARM_RO_PC: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_ARM_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]: +; NO_MOVT_ARM_RO_PC-NEXT: add r0, pc, r[[REG]] + ; THUMB2_RO_ABS: movw r[[REG:[0-9]]], :lower16:b ; THUMB2_RO_ABS: movt r[[REG]], :upper16:b +; NO_MOVT_THUMB2_RO_ABS: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] + ; THUMB2_RO_PC: movw r0, :lower16:(b-([[LPC:.LPC[0-9]+_[0-9]+]]+4)) ; THUMB2_RO_PC: movt r0, :upper16:(b-([[LPC]]+4)) ; THUMB2_RO_PC: [[LPC]]: ; THUMB2_RO_PC-NEXT: add r0, pc +; NO_MOVT_THUMB2_RO_PC: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_THUMB2_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]: +; NO_MOVT_THUMB2_RO_PC-NEXT: add r[[REG]], pc + ; THUMB1_RO_ABS: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] ; THUMB1_RO_PC: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] @@ -211,9 +281,21 @@ entry: ; CHECK: {{(bx lr|pop)}} +; NO_MOVT_ARM_RO_ABS: [[LCPI]] +; 
NO_MOVT_ARM_RO_ABS-NEXT: .long b + +; NO_MOVT_THUMB2_RO_ABS: [[LCPI]] +; NO_MOVT_THUMB2_RO_ABS-NEXT: .long b + ; THUMB1_RO_ABS: [[LCPI]] ; THUMB1_RO_ABS-NEXT: .long b +; NO_MOVT_ARM_RO_PC: [[LCPI]] +; NO_MOVT_ARM_RO_PC-NEXT: .long b-([[LPC]]+8) + +; NO_MOVT_THUMB2_RO_PC: [[LCPI]] +; NO_MOVT_THUMB2_RO_PC-NEXT: .long b-([[LPC]]+4) + ; THUMB1_RO_PC: [[LCPI]] ; THUMB1_RO_PC-NEXT: .long b-([[LPC]]+4) } @@ -226,19 +308,31 @@ entry: ; ARM_RO_ABS: movw r[[REG:[0-9]]], :lower16:take_addr_func ; ARM_RO_ABS: movt r[[REG]], :upper16:take_addr_func +; NO_MOVT_ARM_RO_ABS: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] + ; ARM_RO_PC: movw r[[REG:[0-9]]], :lower16:(take_addr_func-([[LPC:.LPC[0-9]+_[0-9]+]]+8)) ; ARM_RO_PC: movt r[[REG]], :upper16:(take_addr_func-([[LPC]]+8)) ; ARM_RO_PC: [[LPC]]: ; ARM_RO_PC-NEXT: add r0, pc, r[[REG:[0-9]]] +; NO_MOVT_ARM_RO_PC: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_ARM_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]: +; NO_MOVT_ARM_RO_PC-NEXT: add r0, pc, r[[REG]] + ; THUMB2_RO_ABS: movw r[[REG:[0-9]]], :lower16:take_addr_func ; THUMB2_RO_ABS: movt r[[REG]], :upper16:take_addr_func +; NO_MOVT_THUMB2_RO_ABS: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] + ; THUMB2_RO_PC: movw r0, :lower16:(take_addr_func-([[LPC:.LPC[0-9]+_[0-9]+]]+4)) ; THUMB2_RO_PC: movt r0, :upper16:(take_addr_func-([[LPC]]+4)) ; THUMB2_RO_PC: [[LPC]]: ; THUMB2_RO_PC-NEXT: add r0, pc +; NO_MOVT_THUMB2_RO_PC: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] +; NO_MOVT_THUMB2_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]: +; NO_MOVT_THUMB2_RO_PC-NEXT: add r[[REG]], pc + ; THUMB1_RO_ABS: ldr r0, [[LCPI:.LCPI[0-9]+_[0-9]+]] ; THUMB1_RO_PC: ldr r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]] @@ -247,9 +341,21 @@ entry: ; CHECK: {{(bx lr|pop)}} +; NO_MOVT_ARM_RO_ABS: [[LCPI]] +; NO_MOVT_ARM_RO_ABS-NEXT: .long take_addr_func + +; NO_MOVT_THUMB2_RO_ABS: [[LCPI]] +; NO_MOVT_THUMB2_RO_ABS-NEXT: .long take_addr_func + ; THUMB1_RO_ABS: [[LCPI]] ; THUMB1_RO_ABS-NEXT: .long take_addr_func +; NO_MOVT_ARM_RO_PC: [[LCPI]] +; NO_MOVT_ARM_RO_PC-NEXT: .long take_addr_func-([[LPC]]+8) + +; NO_MOVT_THUMB2_RO_PC: [[LCPI]] +; NO_MOVT_THUMB2_RO_PC-NEXT: .long take_addr_func-([[LPC]]+4) + ; THUMB1_RO_PC: [[LCPI]] ; THUMB1_RO_PC-NEXT: .long take_addr_func-([[LPC]]+4) } diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll index 364bd5d13691..e026bae361e1 100644 --- a/test/CodeGen/ARM/atomic-cmpxchg.ll +++ b/test/CodeGen/ARM/atomic-cmpxchg.ll @@ -24,14 +24,12 @@ entry: ; CHECK-THUMB-LABEL: test_cmpxchg_res_i8 ; CHECK-THUMB: bl __sync_val_compare_and_swap_1 ; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0 -; CHECK-THUMB: push {r0} -; CHECK-THUMB: pop {[[R1:r[0-7]]]} +; CHECK-THUMB: movs [[R1:r[0-7]]], r0 ; CHECK-THUMB: movs r0, #1 ; CHECK-THUMB: movs [[R2:r[0-9]+]], #0 ; CHECK-THUMB: cmp [[R1]], {{r[0-9]+}} ; CHECK-THUMB: beq -; CHECK-THUMB: push {[[R2]]} -; CHECK-THUMB: pop {r0} +; CHECK-THUMB: movs r0, [[R2]] ; CHECK-ARMV6-LABEL: test_cmpxchg_res_i8: ; CHECK-ARMV6-NEXT: .fnstart @@ -66,14 +64,14 @@ entry: ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]: ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0] ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0 -; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1 +; CHECK-ARMV7-NEXT: moveq r0, #1 ; CHECK-ARMV7-NEXT: bxeq lr ; CHECK-ARMV7-NEXT: [[TRY]]: -; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0] -; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]] +; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0] +; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1 ; CHECK-ARMV7-NEXT: beq [[HEAD]] ; CHECK-ARMV7-NEXT: clrex -; CHECK-ARMV7-NEXT: mov 
[[RES]], #0 +; CHECK-ARMV7-NEXT: mov r0, #0 ; CHECK-ARMV7-NEXT: bx lr ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8: diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll index e6a4949d53ce..23c4ccea4604 100644 --- a/test/CodeGen/ARM/atomic-op.ll +++ b/test/CodeGen/ARM/atomic-op.ll @@ -320,10 +320,10 @@ define i32 @test_cmpxchg_fail_order1(i32 *%addr, i32 %desired, i32 %new) { ; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]] ; CHECK: cmp [[SUCCESS]], #0 ; CHECK: bne [[LOOP_BB]] -; CHECK: b [[END_BB:\.?LBB[0-9]+_[0-9]+]] +; CHECK: dmb ish +; CHECK: bx lr ; CHECK: [[FAIL_BB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[END_BB]]: ; CHECK: dmb ish ; CHECK: bx lr diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll index 77b850bd617b..d1575ed12e4e 100644 --- a/test/CodeGen/ARM/atomic-ops-v8.ll +++ b/test/CodeGen/ARM/atomic-ops-v8.ll @@ -1045,20 +1045,21 @@ define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i8 %old } @@ -1078,20 +1079,21 @@ define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounw ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i16 %old } @@ -1110,20 +1112,21 @@ define void @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { ; r0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: cmp r[[OLD]], r0 -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. 
; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-ARM-NEXT: bx lr ret void } @@ -1148,16 +1151,16 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]] ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r2, r3 is a reasonable guess. ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]] +; CHECK-NEXT: pop ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr diff --git a/test/CodeGen/ARM/bfi.ll b/test/CodeGen/ARM/bfi.ll index 893fef3add7e..31eff16fcc3c 100644 --- a/test/CodeGen/ARM/bfi.ll +++ b/test/CodeGen/ARM/bfi.ll @@ -77,7 +77,7 @@ entry: define i32 @f7(i32 %x, i32 %y) { ; CHECK-LABEL: f7: -; CHECK: bfi r1, r0, #4, #1 +; CHECK: bfi r0, r2, #4, #1 %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00 %and = and i32 %x, 4 %or = or i32 %y2, 16 @@ -88,8 +88,8 @@ define i32 @f7(i32 %x, i32 %y) { define i32 @f8(i32 %x, i32 %y) { ; CHECK-LABEL: f8: -; CHECK: bfi r1, r0, #4, #1 -; CHECK: bfi r1, r0, #5, #1 +; CHECK: bfi r0, r2, #4, #1 +; CHECK: bfi r0, r2, #5, #1 %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00 %and = and i32 %x, 4 %or = or i32 %y2, 48 @@ -111,7 +111,7 @@ define i32 @f9(i32 %x, i32 %y) { define i32 @f10(i32 %x, i32 %y) { ; CHECK-LABEL: f10: -; CHECK: bfi r1, r0, #4, #2 +; CHECK: bfi r0, r2, #4, #2 %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00 %and = and i32 %x, 4 %or = or i32 %y2, 32 @@ -128,7 +128,7 @@ define i32 @f10(i32 %x, i32 %y) { define i32 @f11(i32 %x, i32 %y) { ; CHECK-LABEL: f11: -; CHECK: bfi r1, r0, #4, #3 +; CHECK: bfi r0, r2, #4, #3 %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00 %and = and i32 %x, 4 %or = or i32 %y2, 32 @@ -150,7 +150,7 @@ define i32 @f11(i32 %x, i32 %y) { define i32 @f12(i32 %x, i32 %y) { ; CHECK-LABEL: f12: -; CHECK: bfi r1, r0, #4, #1 +; CHECK: bfi r0, r2, #4, #1 %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00 %and = and i32 %x, 4 %or = or i32 %y2, 16 diff --git a/test/CodeGen/ARM/bic.ll b/test/CodeGen/ARM/bic.ll index 691f8be4ab66..8be59898bd0f 100644 --- a/test/CodeGen/ARM/bic.ll +++ b/test/CodeGen/ARM/bic.ll @@ -1,17 +1,24 @@ ; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s define i32 @f1(i32 %a, i32 %b) { +; CHECK-LABEL: f1: +; CHECK: bic r0, r0, r1 %tmp = xor i32 %b, 4294967295 %tmp1 = and i32 %a, %tmp ret i32 %tmp1 } -; CHECK: bic r0, r0, r1 - define i32 @f2(i32 %a, i32 %b) { +; CHECK-LABEL: f2: +; CHECK: bic r0, r0, r1 %tmp = xor i32 %b, 4294967295 %tmp1 = and i32 %tmp, %a ret i32 %tmp1 } -; CHECK: bic r0, r0, r1 +define i32 @f3(i32 %a) { +; CHECK-LABEL: f3: +; CHECK: bic r0, r0, #255 + %tmp = and i32 %a, -256 + ret i32 %tmp +} diff --git a/test/CodeGen/ARM/bool-ext-inc.ll b/test/CodeGen/ARM/bool-ext-inc.ll new file mode 100644 index 000000000000..fe43f1b2ef93 --- /dev/null +++ 
b/test/CodeGen/ARM/bool-ext-inc.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-eabi -mattr=neon | FileCheck %s + +define i32 @sext_inc(i1 zeroext %x) { +; CHECK-LABEL: sext_inc: +; CHECK: @ BB#0: +; CHECK-NEXT: rsb r0, r0, #1 +; CHECK-NEXT: mov pc, lr + %ext = sext i1 %x to i32 + %add = add i32 %ext, 1 + ret i32 %add +} + +define <4 x i32> @sext_inc_vec(<4 x i1> %x) { +; CHECK-LABEL: sext_inc_vec: +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmov.i32 q9, #0x1f +; CHECK-NEXT: vmov.i32 q10, #0x1 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vneg.s32 q9, q9 +; CHECK-NEXT: vshl.i32 q8, q8, #31 +; CHECK-NEXT: vshl.s32 q8, q8, q9 +; CHECK-NEXT: vadd.i32 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %ext = sext <4 x i1> %x to <4 x i32> + %add = add <4 x i32> %ext, + ret <4 x i32> %add +} + diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll index b1b3b46dce24..fc85a3a2e683 100644 --- a/test/CodeGen/ARM/build-attributes.ll +++ b/test/CodeGen/ARM/build-attributes.ll @@ -102,6 +102,10 @@ ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=+fp-only-sp -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-FAST ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7-DOUBLE ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING +; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m23 | FileCheck %s --check-prefix=CORTEX-M23 +; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=CORTEX-M33 +; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M33-FAST +; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4 | FileCheck %s --check-prefix=CORTEX-R4 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4f | FileCheck %s --check-prefix=CORTEX-R4F ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5 @@ -182,6 +186,8 @@ ; ARMv7a ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=NO-STRICT-ALIGN ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN +; ARMv7ve +; RUN: llc < %s -mtriple=armv7ve-none-linux-gnueabi | FileCheck %s --check-prefix=V7VE ; ARMv7r ; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=NO-STRICT-ALIGN ; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN @@ -210,6 +216,12 @@ ; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-r52 -mattr=-neon,+fp-only-sp,+d16 | FileCheck %s --check-prefix=ARMv8R --check-prefix=ARMv8R-SP ; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-r52 | FileCheck %s --check-prefix=ARMv8R --check-prefix=ARMv8R-NEON +; ARMv8-M +; RUN: llc < %s -mtriple=thumbv8-none-none-eabi 
-mcpu=cortex-m23 | FileCheck %s --check-prefix=NO-STRICT-ALIGN +; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m23 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN +; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=NO-STRICT-ALIGN +; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m33 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN + ; XSCALE: .eabi_attribute 6, 5 ; XSCALE: .eabi_attribute 8, 1 ; XSCALE: .eabi_attribute 9, 1 @@ -369,6 +381,22 @@ ; V7-FAST-NOT: .eabi_attribute 22 ; V7-FAST: .eabi_attribute 23, 1 +; V7VE: .syntax unified +; V7VE: .eabi_attribute 6, 10 @ Tag_CPU_arch +; V7VE: .eabi_attribute 7, 65 @ Tag_CPU_arch_profile +; V7VE: .eabi_attribute 8, 1 @ Tag_ARM_ISA_use +; V7VE: .eabi_attribute 9, 2 @ Tag_THUMB_ISA_use +; V7VE: .eabi_attribute 17, 1 @ Tag_ABI_PCS_GOT_use +; V7VE: .eabi_attribute 20, 1 @ Tag_ABI_FP_denormal +; V7VE: .eabi_attribute 21, 1 @ Tag_ABI_FP_exceptions +; V7VE: .eabi_attribute 23, 3 @ Tag_ABI_FP_number_model +; V7VE: .eabi_attribute 24, 1 @ Tag_ABI_align_needed +; V7VE: .eabi_attribute 25, 1 @ Tag_ABI_align_preserved +; V7VE: .eabi_attribute 38, 1 @ Tag_ABI_FP_16bit_format +; V7VE: .eabi_attribute 42, 1 @ Tag_MPextension_use +; V7VE: .eabi_attribute 44, 2 @ Tag_DIV_use +; V7VE: .eabi_attribute 68, 3 @ Tag_Virtualization_use + ; V8: .syntax unified ; V8: .eabi_attribute 67, "2.09" ; V8: .eabi_attribute 6, 14 @@ -1310,6 +1338,55 @@ ; CORTEX-A32-FAST-NOT: .eabi_attribute 22 ; CORTEX-A32-FAST: .eabi_attribute 23, 1 +; CORTEX-M23: .cpu cortex-m23 +; CORTEX-M23: .eabi_attribute 6, 16 +; CORTEX-M23: .eabi_attribute 7, 77 +; CORTEX-M23: .eabi_attribute 8, 0 +; CORTEX-M23: .eabi_attribute 9, 3 +; CORTEX-M23: .eabi_attribute 17, 1 +;; We default to IEEE 754 compliance +; CORTEX-M23-NOT: .eabi_attribute 19 +; CORTEX-M23: .eabi_attribute 20, 1 +; CORTEX-M23: .eabi_attribute 21, 1 +; CORTEX-M23: .eabi_attribute 23, 3 +; CORTEX-M23: .eabi_attribute 34, 1 +; CORTEX-M23: .eabi_attribute 24, 1 +; CORTEX-M23-NOT: .eabi_attribute 27 +; CORTEX-M23-NOT: .eabi_attribute 28 +; CORTEX-M23: .eabi_attribute 25, 1 +; CORTEX-M23: .eabi_attribute 38, 1 +; CORTEX-M23: .eabi_attribute 14, 0 +; CORTEX-M23-NOT: .eabi_attribute 44 + +; CORTEX-M33: .cpu cortex-m33 +; CORTEX-M33: .eabi_attribute 6, 17 +; CORTEX-M33: .eabi_attribute 7, 77 +; CORTEX-M33: .eabi_attribute 8, 0 +; CORTEX-M33: .eabi_attribute 9, 3 +; CORTEX-M33: .fpu fpv5-sp-d16 +; CORTEX-M33: .eabi_attribute 17, 1 +;; We default to IEEE 754 compliance +; CORTEX-M23-NOT: .eabi_attribute 19 +; CORTEX-M33: .eabi_attribute 20, 1 +; CORTEX-M33: .eabi_attribute 21, 1 +; CORTEX-M33: .eabi_attribute 23, 3 +; CORTEX-M33: .eabi_attribute 34, 1 +; CORTEX-M33: .eabi_attribute 24, 1 +; CORTEX-M33: .eabi_attribute 25, 1 +; CORTEX-M33: .eabi_attribute 27, 1 +; CORTEX-M33-NOT: .eabi_attribute 28 +; CORTEX-M33: .eabi_attribute 36, 1 +; CORTEX-M33: .eabi_attribute 38, 1 +; CORTEX-M33: .eabi_attribute 46, 1 +; CORTEX-M33-NOT: .eabi_attribute 44 +; CORTEX-M33: .eabi_attribute 14, 0 + +; CORTEX-M33-FAST-NOT: .eabi_attribute 19 +; CORTEX-M33-FAST: .eabi_attribute 20, 2 +; CORTEX-M33-FAST-NOT: .eabi_attribute 21 +; CORTEX-M33-FAST-NOT: .eabi_attribute 22 +; CORTEX-M33-FAST: .eabi_attribute 23, 1 + ; CORTEX-A35: .cpu cortex-a35 ; CORTEX-A35: .eabi_attribute 6, 14 ; CORTEX-A35: .eabi_attribute 7, 65 diff --git a/test/CodeGen/ARM/cmp1-peephole-thumb.mir b/test/CodeGen/ARM/cmp1-peephole-thumb.mir new file mode 100644 index 000000000000..5ace58fd0658 --- 
/dev/null +++ b/test/CodeGen/ARM/cmp1-peephole-thumb.mir @@ -0,0 +1,78 @@ +# RUN: llc -run-pass=peephole-opt %s -o - | FileCheck %s + +--- | + ; ModuleID = '' + source_filename = "" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumb-none--eabi" + + define i32 @f(i32 %a, i32 %b) { + entry: + %mul = mul nsw i32 %b, %a + %cmp = icmp eq i32 %mul, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv + } + +... +--- +name: f +# CHECK-LABEL: name: f +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr } + - { id: 1, class: tgpr } + - { id: 2, class: tgpr } + - { id: 3, class: tgpr } + - { id: 4, class: tgpr } + - { id: 5, class: tgpr } +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + +# CHECK: tMOVi8 1, 14, _ +# CHECK: tMOVi8 0, 14, _ +# CHECK: tMUL %1, %0, 14, _ +# CHECK-NOT: tCMPi8 +body: | + bb.0.entry: + successors: %bb.1.entry(0x40000000), %bb.2.entry(0x40000000) + liveins: %r0, %r1 + + %1 = COPY %r1 + %0 = COPY %r0 + %2, %cpsr = tMUL %1, %0, 14, _ + %3, %cpsr = tMOVi8 1, 14, _ + %4, %cpsr = tMOVi8 0, 14, _ + tCMPi8 killed %2, 0, 14, _, implicit-def %cpsr + tBcc %bb.2.entry, 0, %cpsr + + bb.1.entry: + successors: %bb.2.entry(0x80000000) + + + bb.2.entry: + %5 = PHI %4, %bb.1.entry, %3, %bb.0.entry + %r0 = COPY %5 + tBX_RET 14, _, implicit %r0 + +... diff --git a/test/CodeGen/ARM/cmp2-peephole-thumb.mir b/test/CodeGen/ARM/cmp2-peephole-thumb.mir new file mode 100644 index 000000000000..6e9ca70f1741 --- /dev/null +++ b/test/CodeGen/ARM/cmp2-peephole-thumb.mir @@ -0,0 +1,108 @@ +# RUN: llc -run-pass=peephole-opt %s -o - | FileCheck %s + +# Here we check that the peephole cmp rewrite is not triggered, because +# there is store instruction between the tMUL and tCMP, i.e. there are +# no constants to reorder. + +--- | + ; ModuleID = 'cmp2-peephole-thumb.ll' + source_filename = "" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumb-none--eabi" + + define i32 @g(i32 %a, i32 %b) { + entry: + %retval = alloca i32, align 4 + %mul = alloca i32, align 4 + %mul1 = mul nsw i32 %a, %b + store i32 %mul1, i32* %mul, align 4 + %0 = load i32, i32* %mul, align 4 + %cmp = icmp sle i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + + if.then: ; preds = %entry + store i32 42, i32* %retval, align 4 + br label %return + + if.end: ; preds = %entry + store i32 1, i32* %retval, align 4 + br label %return + + return: ; preds = %if.end, %if.then + %1 = load i32, i32* %retval, align 4 + ret i32 %1 + } + +... 
+--- +name: g +# CHECK-LABEL: name: g +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr } + - { id: 1, class: tgpr } + - { id: 2, class: tgpr } + - { id: 3, class: tgpr } + - { id: 4, class: tgpr } + - { id: 5, class: tgpr } +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +stack: + - { id: 0, name: retval, offset: 0, size: 4, alignment: 4, local-offset: -4 } + - { id: 1, name: mul, offset: 0, size: 4, alignment: 4, local-offset: -8 } + +# CHECK: tMUL +# CHECK-NEXT: tSTRspi +# CHECK-NEXT: tCMPi8 +body: | + bb.0.entry: + successors: %bb.1.if.then(0x40000000), %bb.2.if.end(0x40000000) + liveins: %r0, %r1 + + %1 = COPY %r1 + %0 = COPY %r0 + %2, %cpsr = tMUL %0, %1, 14, _ + tSTRspi %2, %stack.1.mul, 0, 14, _ :: (store 4 into %ir.mul) + tCMPi8 %2, 0, 14, _, implicit-def %cpsr + tBcc %bb.2.if.end, 12, %cpsr + tB %bb.1.if.then, 14, _ + + bb.1.if.then: + successors: %bb.3.return(0x80000000) + + %4, %cpsr = tMOVi8 42, 14, _ + tSTRspi killed %4, %stack.0.retval, 0, 14, _ :: (store 4 into %ir.retval) + tB %bb.3.return, 14, _ + + bb.2.if.end: + successors: %bb.3.return(0x80000000) + + %3, %cpsr = tMOVi8 1, 14, _ + tSTRspi killed %3, %stack.0.retval, 0, 14, _ :: (store 4 into %ir.retval) + + bb.3.return: + %5 = tLDRspi %stack.0.retval, 0, 14, _ :: (dereferenceable load 4 from %ir.retval) + %r0 = COPY %5 + tBX_RET 14, _, implicit %r0 + +... 
diff --git a/test/CodeGen/ARM/cmpxchg-weak.ll b/test/CodeGen/ARM/cmpxchg-weak.ll index 4038528c91bc..0d5681aafbcb 100644 --- a/test/CodeGen/ARM/cmpxchg-weak.ll +++ b/test/CodeGen/ARM/cmpxchg-weak.ll @@ -13,14 +13,16 @@ define void @test_cmpxchg_weak(i32 *%addr, i32 %desired, i32 %new) { ; CHECK-NEXT: dmb ish ; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r2, [r0] ; CHECK-NEXT: cmp [[SUCCESS]], #0 -; CHECK-NEXT: bne [[FAILBB:LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: beq [[SUCCESSBB:LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: BB#2: -; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr ; CHECK-NEXT: [[LDFAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[FAILBB]]: +; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: bx lr +; CHECK-NEXT: [[SUCCESSBB]]: +; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr diff --git a/test/CodeGen/ARM/constantpool-promote.ll b/test/CodeGen/ARM/constantpool-promote.ll index fb1bdfd62fb7..8df7e100c051 100644 --- a/test/CodeGen/ARM/constantpool-promote.ll +++ b/test/CodeGen/ARM/constantpool-promote.ll @@ -1,10 +1,15 @@ -; RUN: llc -relocation-model=static < %s | FileCheck %s -; RUN: llc -relocation-model=pic < %s | FileCheck %s -; RUN: llc -relocation-model=ropi < %s | FileCheck %s -; RUN: llc -relocation-model=rwpi < %s | FileCheck %s - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" -target triple = "armv7--linux-gnueabihf" +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M @.str = private unnamed_addr constant [2 x i8] c"s\00", align 1 @.str1 = private unnamed_addr constant [69 x i8] c"this string is far too long to fit in a literal pool by far and away\00", align 1 @@ -16,6 +21,7 @@ target triple = "armv7--linux-gnueabihf" @.arr3 = private unnamed_addr constant [2 x i16*] [i16* null, i16* null], align 4 @.ptr = private unnamed_addr constant [2 x i16*] [i16* getelementptr inbounds ([2 x i16], [2 x i16]* @.arr2, i32 0, i32 0), i16* null], align 2 @.arr4 = private unnamed_addr constant [2 x i16] [i16 3, i16 4], align 16 +@.zerosize = private unnamed_addr constant [0 x i16] 
zeroinitializer, align 4 ; CHECK-LABEL: @test1 ; CHECK: adr r0, [[x:.*]] @@ -134,18 +140,56 @@ define void @test9() #0 { ret void } +; Ensure that zero sized values are supported / not promoted. +; CHECK-LABEL: @pr32130 +; CHECK-NOT: adr +define void @pr32130() #0 { + tail call void @c(i16* getelementptr inbounds ([0 x i16], [0 x i16]* @.zerosize, i32 0, i32 0)) #2 + ret void +} + +; CHECK-LABEL: @test10 +; CHECK-V6M: adr r{{[0-9]*}}, [[x:.*]] +; CHECK-V6M: [[x]]: +; CHECK-V6M: .asciz "s\000\000" +; CHECK-V7: ldrb{{(.w)?}} r{{[0-9]*}}, [[x:.*]] +; CHECK-V7: [[x]]: +; CHECK-V7: .asciz "s\000\000" +define void @test10(i8* %a) local_unnamed_addr #0 { + call void @llvm.memmove.p0i8.p0i8.i32(i8* %a, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str, i32 0, i32 0), i32 1, i32 1, i1 false) + ret void +} + +; CHECK-LABEL: @test11 +; CHECK-V6M: adr r{{[0-9]*}}, [[x:.*]] +; CHECK-V6M: [[x]]: +; CHECK-V6M: .short 3 +; CHECK-V6M: .short 4 +; CHECK-V7THUMB: ldrh{{(.w)?}} r{{[0-9]*}}, [[x:.*]] +; CHECK-V7THUMB: [[x]]: +; CHECK-V7THUMB: .short 3 +; CHECK-V7THUMB: .short 4 +; CHECK-V7ARM: adr r{{[0-9]*}}, [[x:.*]] +; CHECK-V7ARM: [[x]]: +; CHECK-V7ARM: .short 3 +; CHECK-V7ARM: .short 4 +define void @test11(i16* %a) local_unnamed_addr #0 { + call void @llvm.memmove.p0i16.p0i16.i32(i16* %a, i16* getelementptr inbounds ([2 x i16], [2 x i16]* @.arr1, i32 0, i32 0), i32 2, i32 2, i1 false) + ret void +} + declare void @b(i8*) #1 declare void @c(i16*) #1 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1) +declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) local_unnamed_addr +declare void @llvm.memmove.p0i16.p0i16.i32(i16*, i16*, i32, i32, i1) local_unnamed_addr attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } !llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 1, !"min_enum_size", i32 4} -!2 = !{!"Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)"} diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll index 2987b9a2105a..197746c5f122 100644 --- a/test/CodeGen/ARM/debug-info-s16-reg.ll +++ b/test/CodeGen/ARM/debug-info-s16-reg.ll @@ -3,8 +3,6 @@ ; Test dwarf reg no for s16 ;CHECK: super-register DW_OP_regx ;CHECK-NEXT: 264 -;CHECK-NEXT: DW_OP_piece -;CHECK-NEXT: 4 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" target triple = "thumbv7-apple-macosx10.6.7" diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll index b31d1b7bed4f..094b10499788 100644 --- a/test/CodeGen/ARM/debug-info-sreg2.ll +++ b/test/CodeGen/ARM/debug-info-sreg2.ll @@ -10,7 +10,7 @@ target triple = "thumbv7-apple-macosx10.6.7" ; CHECK: 0x00000000: Beginning address offset: ; CHECK-NEXT: Ending address offset: -; CHECK-NEXT: Location description: 90 {{.. .. .. .. $}} +; CHECK-NEXT: Location description: 90 {{.. .. 
$}} define void @_Z3foov() optsize ssp !dbg !1 { entry: diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll index 997f50760f3a..883731519755 100644 --- a/test/CodeGen/ARM/div.ll +++ b/test/CodeGen/ARM/div.ll @@ -10,12 +10,18 @@ ; RUN: FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-HWDIV ; RUN: llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 | \ ; RUN: FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-EABI +; RUN: llc < %s -mtriple=armv7ve-none-linux-gnu | \ +; RUN: FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-HWDIV +; RUN: llc < %s -mtriple=thumbv7ve-none-linux-gnu | \ +; RUN: FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-HWDIV \ +; RUN: -check-prefix=CHECK-THUMB define i32 @f1(i32 %a, i32 %b) { entry: ; CHECK-LABEL: f1 ; CHECK-SWDIV: __divsi3 +; CHECK-THUMB: .thumb_func ; CHECK-HWDIV: sdiv ; CHECK-EABI: __aeabi_idiv @@ -28,6 +34,7 @@ entry: ; CHECK-LABEL: f2 ; CHECK-SWDIV: __udivsi3 +; CHECK-THUMB: .thumb_func ; CHECK-HWDIV: udiv ; CHECK-EABI: __aeabi_uidiv @@ -40,6 +47,7 @@ entry: ; CHECK-LABEL: f3 ; CHECK-SWDIV: __modsi3 +; CHECK-THUMB: .thumb_func ; CHECK-HWDIV: sdiv ; CHECK-HWDIV: mls @@ -55,6 +63,7 @@ entry: ; CHECK-LABEL: f4 ; CHECK-SWDIV: __umodsi3 +; CHECK-THUMB: .thumb_func ; CHECK-HWDIV: udiv ; CHECK-HWDIV: mls diff --git a/test/CodeGen/ARM/fast-isel-align.ll b/test/CodeGen/ARM/fast-isel-align.ll index 701884e926a8..71cd73a4a25d 100644 --- a/test/CodeGen/ARM/fast-isel-align.ll +++ b/test/CodeGen/ARM/fast-isel-align.ll @@ -72,10 +72,10 @@ entry: %4 = fcmp une float %3, 0.000000e+00 ; ARM: ldr r[[R:[0-9]+]], [r0, #2] ; ARM: vmov s0, r[[R]] -; ARM: vcmpe.f32 s0, #0 +; ARM: vcmp.f32 s0, #0 ; THUMB: ldr.w r[[R:[0-9]+]], [r0, #2] ; THUMB: vmov s0, r[[R]] -; THUMB: vcmpe.f32 s0, #0 +; THUMB: vcmp.f32 s0, #0 ret i1 %4 } diff --git a/test/CodeGen/ARM/fast-isel-cmp-imm.ll b/test/CodeGen/ARM/fast-isel-cmp-imm.ll index a9d7e4580638..543b6c285f3f 100644 --- a/test/CodeGen/ARM/fast-isel-cmp-imm.ll +++ b/test/CodeGen/ARM/fast-isel-cmp-imm.ll @@ -7,8 +7,8 @@ entry: ; ARM: t1a ; THUMB: t1a %cmp = fcmp oeq float %a, 0.000000e+00 -; ARM: vcmpe.f32 s{{[0-9]+}}, #0 -; THUMB: vcmpe.f32 s{{[0-9]+}}, #0 +; ARM: vcmp.f32 s{{[0-9]+}}, #0 +; THUMB: vcmp.f32 s{{[0-9]+}}, #0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry @@ -28,9 +28,9 @@ entry: ; THUMB: t1b %cmp = fcmp oeq float %a, -0.000000e+00 ; ARM: vldr -; ARM: vcmpe.f32 s{{[0-9]+}}, s{{[0-9]+}} +; ARM: vcmp.f32 s{{[0-9]+}}, s{{[0-9]+}} ; THUMB: vldr -; THUMB: vcmpe.f32 s{{[0-9]+}}, s{{[0-9]+}} +; THUMB: vcmp.f32 s{{[0-9]+}}, s{{[0-9]+}} br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry @@ -46,8 +46,8 @@ entry: ; ARM: t2a ; THUMB: t2a %cmp = fcmp oeq double %a, 0.000000e+00 -; ARM: vcmpe.f64 d{{[0-9]+}}, #0 -; THUMB: vcmpe.f64 d{{[0-9]+}}, #0 +; ARM: vcmp.f64 d{{[0-9]+}}, #0 +; THUMB: vcmp.f64 d{{[0-9]+}}, #0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry @@ -65,9 +65,9 @@ entry: ; THUMB: t2b %cmp = fcmp oeq double %a, -0.000000e+00 ; ARM: vldr -; ARM: vcmpe.f64 d{{[0-9]+}}, d{{[0-9]+}} +; ARM: vcmp.f64 d{{[0-9]+}}, d{{[0-9]+}} ; THUMB: vldr -; THUMB: vcmpe.f64 d{{[0-9]+}}, d{{[0-9]+}} +; THUMB: vcmp.f64 d{{[0-9]+}}, d{{[0-9]+}} br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll index 442459bc0582..eb32ee54c095 100644 --- a/test/CodeGen/ARM/fold-stack-adjust.ll +++ b/test/CodeGen/ARM/fold-stack-adjust.ll @@ -135,7 +135,7 @@ define void 
@test_fold_point(i1 %tst) minsize { ; Important to check for beginning of basic block, because if it gets ; if-converted the test is probably no longer checking what it should. -; CHECK: {{LBB[0-9]+_2}}: +; CHECK: %end ; CHECK-NEXT: vpop {d7, d8} ; CHECK-NEXT: pop {r4, pc} diff --git a/test/CodeGen/ARM/fp-only-sp.ll b/test/CodeGen/ARM/fp-only-sp.ll new file mode 100644 index 000000000000..2c7b2acbde9c --- /dev/null +++ b/test/CodeGen/ARM/fp-only-sp.ll @@ -0,0 +1,62 @@ +; RUN: llc -mtriple=thumbv7em-apple-macho -mcpu=cortex-m4 %s -o - -O0 | FileCheck %s +; RUN: llc -mtriple=thumbv7em-apple-macho -mcpu=cortex-m4 %s -o - | FileCheck %s + +; Note: vldr and vstr really do have 64-bit variants even with fp-only-sp +define void @test_load_store(double* %addr) { +; CHECK-LABEL: test_load_store: +; CHECK: vldr [[TMP:d[0-9]+]], [r0] +; CHECK: vstr [[TMP]], [r0] + %val = load volatile double, double* %addr + store volatile double %val, double* %addr + ret void +} + +define void @test_cmp(double %l, double %r, i1* %addr.dst) { +; CHECK-LABEL: test_cmp: +; CHECK: bl ___eqdf2 + %res = fcmp oeq double %l, %r + store i1 %res, i1* %addr.dst + ret void +} + +define void @test_ext(float %in, double* %addr) { +; CHECK-LABEL: test_ext: +; CHECK: bl ___extendsfdf2 + %res = fpext float %in to double + store double %res, double* %addr + ret void +} + +define void @test_trunc(double %in, float* %addr) { +; CHECK-LABEL: test_trunc: +; CHECK: bl ___truncdfsf2 + %res = fptrunc double %in to float + store float %res, float* %addr + ret void +} + +define void @test_itofp(i32 %in, double* %addr) { +; CHECK-LABEL: test_itofp: +; CHECK: bl ___floatsidf + %res = sitofp i32 %in to double + store double %res, double* %addr +; %res = fptoui double %tmp to i32 + ret void +} + +define i32 @test_fptoi(double* %addr) { +; CHECK-LABEL: test_fptoi: +; CHECK: bl ___fixunsdfsi + %val = load double, double* %addr + %res = fptoui double %val to i32 + ret i32 %res +} + +define void @test_binop(double* %addr) { +; CHECK-LABEL: test_binop: +; CHECK: bl ___adddf3 + %in = load double, double* %addr + %res = fadd double %in, %in + store double %res, double* %addr + ret void +} diff --git a/test/CodeGen/ARM/fp16-promote.ll b/test/CodeGen/ARM/fp16-promote.ll index 824123687287..9148ac109ae3 100644 --- a/test/CodeGen/ARM/fp16-promote.ll +++ b/test/CodeGen/ARM/fp16-promote.ll @@ -161,14 +161,14 @@ define void @test_select(half* %p, half* %q, i1 zeroext %c) #0 { ret void } -; Test only two variants of fcmp. These get translated to f32 vcmpe +; Test only two variants of fcmp. These get translated to f32 vcmp ; instructions anyway. 
; CHECK-ALL-LABEL: test_fcmp_une: ; CHECK-FP16: vcvtb.f32.f16 ; CHECK-FP16: vcvtb.f32.f16 ; CHECK-LIBCALL: bl __aeabi_h2f ; CHECK-LIBCALL: bl __aeabi_h2f -; CHECK-VFP: vcmpe.f32 +; CHECK-VFP: vcmp.f32 ; CHECK-NOVFP: bl __aeabi_fcmpeq ; CHECK-FP16: vmrs APSR_nzcv, fpscr ; CHECK-ALL: movw{{ne|eq}} @@ -184,7 +184,7 @@ define i1 @test_fcmp_une(half* %p, half* %q) #0 { ; CHECK-FP16: vcvtb.f32.f16 ; CHECK-LIBCALL: bl __aeabi_h2f ; CHECK-LIBCALL: bl __aeabi_h2f -; CHECK-VFP: vcmpe.f32 +; CHECK-VFP: vcmp.f32 ; CHECK-NOVFP: bl __aeabi_fcmpeq ; CHECK-FP16: vmrs APSR_nzcv, fpscr ; CHECK-LIBCALL: movw{{ne|eq}} @@ -597,7 +597,7 @@ define void @test_fma(half* %p, half* %q, half* %r) #0 { ; CHECK-FP16: vcvtb.f16.f32 ; CHECK-LIBCALL-LABEL: test_fabs: ; CHECK-LIBCALL: bl __aeabi_h2f -; CHECK-LIBCALL: bfc +; CHECK-LIBCALL: bic ; CHECK-LIBCALL: bl __aeabi_f2h define void @test_fabs(half* %p) { %a = load half, half* %p, align 2 @@ -643,10 +643,11 @@ define void @test_maxnum(half* %p, half* %q) #0 { } ; CHECK-ALL-LABEL: test_minnan: -; CHECK-FP16: vcvtb.f32.f16 +; CHECK-FP16: vmov.f32 s0, #1.000000e+00 ; CHECK-FP16: vcvtb.f32.f16 ; CHECK-LIBCALL: bl __aeabi_h2f -; CHECK-LIBCALL: bl __aeabi_h2f +; CHECK-LIBCALL-VFP: vmov.f32 s{{[0-9]+}}, #1.000000e+00 +; CHECK-NOVFP: mov r{{[0-9]+}}, #1065353216 ; CHECK-VFP: vmin.f32 ; CHECK-NOVFP: bl __aeabi_fcmpge ; CHECK-FP16: vcvtb.f16.f32 @@ -660,10 +661,11 @@ define void @test_minnan(half* %p) #0 { } ; CHECK-ALL-LABEL: test_maxnan: +; CHECK-FP16: vmov.f32 s0, #1.000000e+00 ; CHECK-FP16: vcvtb.f32.f16 -; CHECK-FP16: vcvtb.f32.f16 -; CHECK-LIBCALL: bl __aeabi_h2f ; CHECK-LIBCALL: bl __aeabi_h2f +; CHECK-LIBCALL-VFP: vmov.f32 s0, #1.000000e+00 +; CHECK-NOVFP: mov r{{[0-9]+}}, #1065353216 ; CHECK-VFP: vmax.f32 ; CHECK-NOVFP: bl __aeabi_fcmple ; CHECK-FP16: vcvtb.f16.f32 @@ -685,7 +687,7 @@ define void @test_maxnan(half* %p) #0 { ; CHECK-LIBCALL: bl __aeabi_h2f ; CHECK-LIBCALL: bl __aeabi_h2f ; CHECK-VFP-LIBCALL: vbsl -; CHECK-NOVFP: bfc +; CHECK-NOVFP: bic ; CHECK-NOVFP: and ; CHECK-NOVFP: orr ; CHECK-LIBCALL: bl __aeabi_f2h @@ -845,21 +847,15 @@ define void @test_insertelement(half* %p, <4 x half>* %q, i32 %i) #0 { } ; CHECK-ALL-LABEL: test_extractelement: +; CHECK-VFP: push {{{.*}}, lr} ; CHECK-VFP: sub sp, sp, #8 -; CHECK-VFP: ldrh -; CHECK-VFP: ldrh -; CHECK-VFP: orr -; CHECK-VFP: str -; CHECK-VFP: ldrh -; CHECK-VFP: ldrh -; CHECK-VFP: orr -; CHECK-VFP: str +; CHECK-VFP: ldrd ; CHECK-VFP: mov ; CHECK-VFP: orr ; CHECK-VFP: ldrh ; CHECK-VFP: strh ; CHECK-VFP: add sp, sp, #8 -; CHECK-VFP: bx lr +; CHECK-VFP: pop {{{.*}}, pc} ; CHECK-NOVFP: ldrh ; CHECK-NOVFP: strh ; CHECK-NOVFP: ldrh diff --git a/test/CodeGen/ARM/fp16-v3.ll b/test/CodeGen/ARM/fp16-v3.ll index e26455e61e7f..a37f71d9ba88 100644 --- a/test/CodeGen/ARM/fp16-v3.ll +++ b/test/CodeGen/ARM/fp16-v3.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "armv7a--none-eabi" ; CHECK-LABEL: test_vec3: -; CHECK-DAG: vcvtb.f32.f16 [[SREG1:s[0-9]+]], +; CHECK-DAG: vmov.f32 [[SREG1:s[0-9]+]], #1.200000e+01 ; CHECK-DAG: vcvt.f32.s32 [[SREG2:s[0-9]+]], ; CHECK-DAG: vcvtb.f16.f32 [[SREG3:s[0-9]+]], [[SREG2]] ; CHECK-DAG: vcvtb.f32.f16 [[SREG4:s[0-9]+]], [[SREG3]] diff --git a/test/CodeGen/ARM/fpcmp-opt.ll b/test/CodeGen/ARM/fpcmp-opt.ll index 45bb6d2f702d..a82854109450 100644 --- a/test/CodeGen/ARM/fpcmp-opt.ll +++ b/test/CodeGen/ARM/fpcmp-opt.ll @@ -10,7 +10,7 @@ entry: ; CHECK-LABEL: t1: ; CHECK: vldr [[S0:s[0-9]+]], ; CHECK: vldr [[S1:s[0-9]+]], -; CHECK: vcmpe.f32 [[S1]], 
[[S0]] +; CHECK: vcmp.f32 [[S1]], [[S0]] ; CHECK: vmrs APSR_nzcv, fpscr ; CHECK: beq %0 = load float, float* %a @@ -35,10 +35,10 @@ entry: ; CHECK-NOT: vldr ; CHECK: ldrd [[REG1:(r[0-9]+)]], [[REG2:(r[0-9]+)]], [r0] ; CHECK-NOT: b LBB -; CHECK: bfc [[REG2]], #31, #1 +; CHECK: bic [[REG2]], [[REG2]], #-2147483648 ; CHECK: cmp [[REG1]], #0 ; CHECK: cmpeq [[REG2]], #0 -; CHECK-NOT: vcmpe.f32 +; CHECK-NOT: vcmp.f32 ; CHECK-NOT: vmrs ; CHECK: bne %0 = load double, double* %a @@ -61,7 +61,7 @@ entry: ; CHECK: ldr [[REG3:(r[0-9]+)]], [r0] ; CHECK: mvn [[REG4:(r[0-9]+)]], #-2147483648 ; CHECK: tst [[REG3]], [[REG4]] -; CHECK-NOT: vcmpe.f32 +; CHECK-NOT: vcmp.f32 ; CHECK-NOT: vmrs ; CHECK: bne %0 = load float, float* %a diff --git a/test/CodeGen/ARM/fpcmp.ll b/test/CodeGen/ARM/fpcmp.ll index e3ffd45a396d..67326e000169 100644 --- a/test/CodeGen/ARM/fpcmp.ll +++ b/test/CodeGen/ARM/fpcmp.ll @@ -12,7 +12,7 @@ entry: define i32 @f2(float %a) { ;CHECK-LABEL: f2: -;CHECK: vcmpe.f32 +;CHECK: vcmp.f32 ;CHECK: moveq entry: %tmp = fcmp oeq float %a, 1.000000e+00 ; [#uses=1] @@ -52,7 +52,7 @@ entry: define i32 @f6(float %a) { ;CHECK-LABEL: f6: -;CHECK: vcmpe.f32 +;CHECK: vcmp.f32 ;CHECK: movne entry: %tmp = fcmp une float %a, 1.000000e+00 ; [#uses=1] diff --git a/test/CodeGen/ARM/fpcmp_ueq.ll b/test/CodeGen/ARM/fpcmp_ueq.ll index c1696c9be1b7..698c7506cc59 100644 --- a/test/CodeGen/ARM/fpcmp_ueq.ll +++ b/test/CodeGen/ARM/fpcmp_ueq.ll @@ -17,7 +17,7 @@ entry: ; CHECK-ARMv4: moveq r0, #42 ; CHECK-ARMv7-LABEL: f7: -; CHECK-ARMv7: vcmpe.f32 +; CHECK-ARMv7: vcmp.f32 ; CHECK-ARMv7: vmrs APSR_nzcv, fpscr ; CHECK-ARMv7: movweq ; CHECK-ARMv7-NOT: vmrs diff --git a/test/CodeGen/ARM/fpscr-intrinsics.ll b/test/CodeGen/ARM/fpscr-intrinsics.ll new file mode 100644 index 000000000000..64b97525febf --- /dev/null +++ b/test/CodeGen/ARM/fpscr-intrinsics.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s -O0 -mtriple=armv7-eabi -mcpu=cortex-a8 -mattr=+neon,+fp-armv8 | FileCheck %s +; RUN: llc < %s -O3 -mtriple=armv7-eabi -mcpu=cortex-a8 -mattr=+neon,+fp-armv8 | FileCheck %s + +@a = common global double 0.000000e+00, align 8 + +; Function Attrs: noinline nounwind uwtable +define void @strtod() { +entry: + ; CHECK: vmrs r{{[0-9]+}}, fpscr + %0 = call i32 @llvm.flt.rounds() + %tobool = icmp ne i32 %0, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + store double 5.000000e-01, double* @a, align 8 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: nounwind +define void @fn1(i32* nocapture %p) local_unnamed_addr { +entry: + ; CHECK: vmrs r{{[0-9]+}}, fpscr + %0 = tail call i32 @llvm.arm.get.fpscr() + store i32 %0, i32* %p, align 4 + ; CHECK: vmsr fpscr, r{{[0-9]+}} + tail call void @llvm.arm.set.fpscr(i32 1) + ; CHECK: vmrs r{{[0-9]+}}, fpscr + %1 = tail call i32 @llvm.arm.get.fpscr() + %arrayidx1 = getelementptr inbounds i32, i32* %p, i32 1 + store i32 %1, i32* %arrayidx1, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i32 @llvm.arm.get.fpscr() + +; Function Attrs: nounwind writeonly +declare void @llvm.arm.set.fpscr(i32) + +; Function Attrs: nounwind +declare i32 @llvm.flt.rounds() diff --git a/test/CodeGen/ARM/gpr-paired-spill.ll b/test/CodeGen/ARM/gpr-paired-spill.ll index ef3e5a54a2db..797b147d5d01 100644 --- a/test/CodeGen/ARM/gpr-paired-spill.ll +++ b/test/CodeGen/ARM/gpr-paired-spill.ll @@ -16,22 +16,22 @@ define void @foo(i64* %addr) { ; an LDMIA was created with both a FrameIndex and an offset, which ; is not allowed. 
-; CHECK-WITH-LDRD: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8] -; CHECK-WITH-LDRD: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp] +; CHECK-WITH-LDRD-DAG: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8] +; CHECK-WITH-LDRD-DAG: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp] -; CHECK-WITH-LDRD: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8] -; CHECK-WITH-LDRD: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp] +; CHECK-WITH-LDRD-DAG: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8] +; CHECK-WITH-LDRD-DAG: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp] ; We also want to ensure the register scavenger is working (i.e. an ; offset from sp can be generated), so we need two spills. -; CHECK-WITHOUT-LDRD: add [[ADDRREG:[a-z0-9]+]], sp, #{{[0-9]+}} -; CHECK-WITHOUT-LDRD: stm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}} -; CHECK-WITHOUT-LDRD: stm sp, {r{{[0-9]+}}, r{{[0-9]+}}} +; CHECK-WITHOUT-LDRD-DAG: add [[ADDRREG:[a-z0-9]+]], sp, #{{[0-9]+}} +; CHECK-WITHOUT-LDRD-DAG: stm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}} +; CHECK-WITHOUT-LDRD-DAG: stm sp, {r{{[0-9]+}}, r{{[0-9]+}}} ; In principle LLVM may have to recalculate the offset. At the moment ; it reuses the original though. -; CHECK-WITHOUT-LDRD: ldm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}} -; CHECK-WITHOUT-LDRD: ldm sp, {r{{[0-9]+}}, r{{[0-9]+}}} +; CHECK-WITHOUT-LDRD-DAG: ldm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}} +; CHECK-WITHOUT-LDRD-DAG: ldm sp, {r{{[0-9]+}}, r{{[0-9]+}}} store volatile i64 %val1, i64* %addr store volatile i64 %val2, i64* %addr diff --git a/test/CodeGen/ARM/ifcvt10.ll b/test/CodeGen/ARM/ifcvt10.ll index 5725a404c320..c7e18d35dbee 100644 --- a/test/CodeGen/ARM/ifcvt10.ll +++ b/test/CodeGen/ARM/ifcvt10.ll @@ -9,8 +9,6 @@ entry: ; CHECK-LABEL: t: ; CHECK: vpop {d8} ; CHECK-NOT: vpopne -; CHECK: pop {r7, pc} -; CHECK: vpop {d8} ; CHECK: pop {r7, pc} br i1 undef, label %if.else, label %if.then diff --git a/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/test/CodeGen/ARM/illegal-bitfield-loadstore.ll new file mode 100644 index 000000000000..74117d3896bd --- /dev/null +++ b/test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -0,0 +1,184 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-eabi | FileCheck %s -check-prefix=LE +; RUN: llc < %s -mtriple=armeb-eabi | FileCheck %s -check-prefix=BE + +define void @i24_or(i24* %a) { +; LE-LABEL: i24_or: +; LE: @ BB#0: +; LE-NEXT: ldrh r1, [r0] +; LE-NEXT: orr r1, r1, #384 +; LE-NEXT: strh r1, [r0] +; LE-NEXT: mov pc, lr +; +; BE-LABEL: i24_or: +; BE: @ BB#0: +; BE-NEXT: ldrh r1, [r0] +; BE-NEXT: ldrb r2, [r0, #2] +; BE-NEXT: orr r1, r2, r1, lsl #8 +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: strb r1, [r0, #2] +; BE-NEXT: lsr r1, r1, #8 +; BE-NEXT: strh r1, [r0] +; BE-NEXT: mov pc, lr + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +define void @i24_and_or(i24* %a) { +; LE-LABEL: i24_and_or: +; LE: @ BB#0: +; LE-NEXT: ldrh r1, [r0] +; LE-NEXT: mov r2, #16256 +; LE-NEXT: orr r2, r2, #49152 +; LE-NEXT: orr r1, r1, #384 +; LE-NEXT: and r1, r1, r2 +; LE-NEXT: strh r1, [r0] +; LE-NEXT: mov pc, lr +; +; BE-LABEL: i24_and_or: +; BE: @ BB#0: +; BE-NEXT: mov r1, #128 +; BE-NEXT: strb r1, [r0, #2] +; BE-NEXT: ldrh r1, [r0] +; BE-NEXT: orr r1, r1, #1 +; BE-NEXT: strh r1, [r0] +; BE-NEXT: mov pc, lr + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { +; LE-LABEL: i24_insert_bit: +; LE: @ BB#0: +; LE-NEXT: ldrh r2, [r0] +; LE-NEXT: mov r3, 
#255 +; LE-NEXT: orr r3, r3, #57088 +; LE-NEXT: and r2, r2, r3 +; LE-NEXT: orr r1, r2, r1, lsl #13 +; LE-NEXT: strh r1, [r0] +; LE-NEXT: mov pc, lr +; +; BE-LABEL: i24_insert_bit: +; BE: @ BB#0: +; BE-NEXT: ldrh r2, [r0] +; BE-NEXT: mov r3, #57088 +; BE-NEXT: orr r3, r3, #16711680 +; BE-NEXT: and r2, r3, r2, lsl #8 +; BE-NEXT: orr r1, r2, r1, lsl #13 +; BE-NEXT: lsr r1, r1, #8 +; BE-NEXT: strh r1, [r0] +; BE-NEXT: mov pc, lr + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +define void @i56_or(i56* %a) { +; LE-LABEL: i56_or: +; LE: @ BB#0: +; LE-NEXT: ldr r1, [r0] +; LE-NEXT: orr r1, r1, #384 +; LE-NEXT: str r1, [r0] +; LE-NEXT: mov pc, lr +; +; BE-LABEL: i56_or: +; BE: @ BB#0: +; BE-NEXT: mov r1, r0 +; BE-NEXT: ldr r12, [r0] +; BE-NEXT: ldrh r2, [r1, #4]! +; BE-NEXT: ldrb r3, [r1, #2] +; BE-NEXT: orr r2, r3, r2, lsl #8 +; BE-NEXT: orr r2, r2, r12, lsl #24 +; BE-NEXT: orr r2, r2, #384 +; BE-NEXT: lsr r3, r2, #8 +; BE-NEXT: strb r2, [r1, #2] +; BE-NEXT: strh r3, [r1] +; BE-NEXT: bic r1, r12, #255 +; BE-NEXT: orr r1, r1, r2, lsr #24 +; BE-NEXT: str r1, [r0] +; BE-NEXT: mov pc, lr + %aa = load i56, i56* %a + %b = or i56 %aa, 384 + store i56 %b, i56* %a + ret void +} + +define void @i56_and_or(i56* %a) { +; LE-LABEL: i56_and_or: +; LE: @ BB#0: +; LE-NEXT: ldr r1, [r0] +; LE-NEXT: orr r1, r1, #384 +; LE-NEXT: bic r1, r1, #127 +; LE-NEXT: str r1, [r0] +; LE-NEXT: mov pc, lr +; +; BE-LABEL: i56_and_or: +; BE: @ BB#0: +; BE-NEXT: mov r1, r0 +; BE-NEXT: mov r3, #128 +; BE-NEXT: ldrh r2, [r1, #4]! +; BE-NEXT: strb r3, [r1, #2] +; BE-NEXT: lsl r2, r2, #8 +; BE-NEXT: ldr r12, [r0] +; BE-NEXT: orr r2, r2, r12, lsl #24 +; BE-NEXT: orr r2, r2, #384 +; BE-NEXT: lsr r3, r2, #8 +; BE-NEXT: strh r3, [r1] +; BE-NEXT: bic r1, r12, #255 +; BE-NEXT: orr r1, r1, r2, lsr #24 +; BE-NEXT: str r1, [r0] +; BE-NEXT: mov pc, lr + + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { +; LE-LABEL: i56_insert_bit: +; LE: @ BB#0: +; LE-NEXT: ldr r2, [r0] +; LE-NEXT: bic r2, r2, #8192 +; LE-NEXT: orr r1, r2, r1, lsl #13 +; LE-NEXT: str r1, [r0] +; LE-NEXT: mov pc, lr +; +; BE-LABEL: i56_insert_bit: +; BE: @ BB#0: +; BE-NEXT: .save {r11, lr} +; BE-NEXT: push {r11, lr} +; BE-NEXT: mov r2, r0 +; BE-NEXT: ldr lr, [r0] +; BE-NEXT: ldrh r12, [r2, #4]! 
+; BE-NEXT: ldrb r3, [r2, #2] +; BE-NEXT: orr r12, r3, r12, lsl #8 +; BE-NEXT: orr r3, r12, lr, lsl #24 +; BE-NEXT: bic r3, r3, #8192 +; BE-NEXT: orr r1, r3, r1, lsl #13 +; BE-NEXT: lsr r3, r1, #8 +; BE-NEXT: strh r3, [r2] +; BE-NEXT: bic r2, lr, #255 +; BE-NEXT: orr r1, r2, r1, lsr #24 +; BE-NEXT: str r1, [r0] +; BE-NEXT: pop {r11, lr} +; BE-NEXT: mov pc, lr + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + diff --git a/test/CodeGen/ARM/indirectbr.ll b/test/CodeGen/ARM/indirectbr.ll index d15ef14b4493..90defad43a7d 100644 --- a/test/CodeGen/ARM/indirectbr.ll +++ b/test/CodeGen/ARM/indirectbr.ll @@ -47,6 +47,7 @@ L3: ; preds = %L4, %bb2 br label %L2 L2: ; preds = %L3, %bb2 +; THUMB-LABEL: %L1.clone ; THUMB: muls %res.2 = phi i32 [ %res.1, %L3 ], [ 1, %bb2 ] ; [#uses=1] %phitmp = mul i32 %res.2, 6 ; [#uses=1] diff --git a/test/CodeGen/ARM/interval-update-remat.ll b/test/CodeGen/ARM/interval-update-remat.ll index 6391d4c29604..524e8a0aa491 100644 --- a/test/CodeGen/ARM/interval-update-remat.ll +++ b/test/CodeGen/ARM/interval-update-remat.ll @@ -109,7 +109,7 @@ _ZN7MessageD1Ev.exit: ; preds = %if.then.i.i.i.i, %i } ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #0 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 declare %class.StaticSocketDataProvider.6.231.281.1306.2331* @_ZN24StaticSocketDataProviderC1EP13MockReadWritejS1_j(%class.StaticSocketDataProvider.6.231.281.1306.2331* returned, %struct.MockReadWrite.7.232.282.1307.2332*, i32, %struct.MockReadWrite.7.232.282.1307.2332*, i32) unnamed_addr @@ -130,7 +130,7 @@ declare %class.Message.13.238.288.1313.2338* @_ZN7MessageC1Ev(%class.Message.13. declare %class.AssertHelper.10.235.285.1310.2335* @_ZN12AssertHelperD1Ev(%class.AssertHelper.10.235.285.1310.2335* returned) unnamed_addr ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #0 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 declare void @_ZN18ClientSocketHandle5m_fn3IPiEEvRK25Trans_NS___1_basic_stringIciiE13scoped_refptr15RequestPriorityN16ClientSocketPool13RespectLimitsERiT_11BoundNetLog(%class.ClientSocketHandle.14.239.289.1314.2339*, %class.Trans_NS___1_basic_string.18.243.293.1318.2343* dereferenceable(12), %class.scoped_refptr.19.244.294.1319.2344*, i32, i32, i32* dereferenceable(4), i32*, %class.BoundNetLog.20.245.295.1320.2345*) diff --git a/test/CodeGen/ARM/intrinsics-coprocessor.ll b/test/CodeGen/ARM/intrinsics-coprocessor.ll index 8fea49b39fb6..5352471238f9 100644 --- a/test/CodeGen/ARM/intrinsics-coprocessor.ll +++ b/test/CodeGen/ARM/intrinsics-coprocessor.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 | FileCheck %s -; RUN: llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 | FileCheck %s define void @coproc(i8* %i) nounwind { entry: diff --git a/test/CodeGen/ARM/ldm-stm-i256.ll b/test/CodeGen/ARM/ldm-stm-i256.ll new file mode 100644 index 000000000000..7b4151dabf6d --- /dev/null +++ b/test/CodeGen/ARM/ldm-stm-i256.ll @@ -0,0 +1,38 @@ +; RUN: llc -mtriple=armv7--eabi -verify-machineinstrs < %s | FileCheck %s + +; Check the way we schedule/merge a bunch of loads and stores. +; Originally test/CodeGen/ARM/2011-07-07-ScheduleDAGCrash.ll ; now +; being used as a test of optimizations related to ldm/stm. + +; FIXME: We could merge more loads/stores with regalloc hints. 
+; FIXME: Fix scheduling so we don't have 16 live registers. + +define void @f(i256* nocapture %a, i256* nocapture %b, i256* nocapture %cc, i256* nocapture %dd) nounwind uwtable noinline ssp { +entry: + %c = load i256, i256* %cc + %d = load i256, i256* %dd + %add = add nsw i256 %c, %d + store i256 %add, i256* %a, align 8 + %or = or i256 %c, 1606938044258990275541962092341162602522202993782792835301376 + %add6 = add nsw i256 %or, %d + store i256 %add6, i256* %b, align 8 + ret void + ; CHECK-DAG: ldm r3 + ; CHECK-DAG: ldm r2 + ; CHECK-DAG: ldr {{.*}}, [r3, #20] + ; CHECK-DAG: ldr {{.*}}, [r3, #16] + ; CHECK-DAG: ldr {{.*}}, [r3, #28] + ; CHECK-DAG: ldr {{.*}}, [r3, #24] + ; CHECK-DAG: ldr {{.*}}, [r2, #20] + ; CHECK-DAG: ldr {{.*}}, [r2, #16] + ; CHECK-DAG: ldr {{.*}}, [r2, #28] + ; CHECK-DAG: ldr {{.*}}, [r2, #24] + ; CHECK-DAG: stmib r0 + ; CHECK-DAG: str {{.*}}, [r0] + ; CHECK-DAG: str {{.*}}, [r0, #24] + ; CHECK-DAG: str {{.*}}, [r0, #28] + ; CHECK-DAG: str {{.*}}, [r1] + ; CHECK-DAG: stmib r1 + ; CHECK-DAG: str {{.*}}, [r1, #24] + ; CHECK-DAG: str {{.*}}, [r1, #28] +} diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll index 6a9e63f649c9..6981cfcb0855 100644 --- a/test/CodeGen/ARM/ldrd.ll +++ b/test/CodeGen/ARM/ldrd.ll @@ -80,7 +80,7 @@ return: ; preds = %bb, %entry ; CHECK-LABEL: Func1: define void @Func1() nounwind ssp "no-frame-pointer-elim"="true" { -entry: +entry: ; A8: movw [[BASE:r[0-9]+]], :lower16:{{.*}}TestVar{{.*}} ; A8: movt [[BASE]], :upper16:{{.*}}TestVar{{.*}} ; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], {{\[}}[[BASE]], #4] @@ -88,12 +88,12 @@ entry: ; A8-NEXT: str [[FIELD1]], {{\[}}[[BASE]]{{\]}} ; CONSERVATIVE-NOT: ldrd %orig_blocks = alloca [256 x i16], align 2 - %0 = bitcast [256 x i16]* %orig_blocks to i8*call void @llvm.lifetime.start(i64 512, i8* %0) nounwind + %0 = bitcast [256 x i16]* %orig_blocks to i8*call void @llvm.lifetime.start.p0i8(i64 512, i8* %0) nounwind %tmp1 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 1), align 4 %tmp2 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 2), align 4 %add = add nsw i32 %tmp2, %tmp1 store i32 %add, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 0), align 4 - call void @llvm.lifetime.end(i64 512, i8* %0) nounwind + call void @llvm.lifetime.end.p0i8(i64 512, i8* %0) nounwind ret void } @@ -189,5 +189,23 @@ define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) "no-frame-pointer-e ret i32* %p1 } -declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind +; CHECK-LABEL: ldrd_strd_aa: +; NORMAL: ldrd [[TMP1:r[0-9]]], [[TMP2:r[0-9]]], +; NORMAL: strd [[TMP1]], [[TMP2]], +; CONSERVATIVE-NOT: ldrd +; CONSERVATIVE-NOT: strd +; CHECK: bx lr + +define void @ldrd_strd_aa(i32* noalias nocapture %x, i32* noalias nocapture readonly %y) { +entry: + %0 = load i32, i32* %y, align 4 + store i32 %0, i32* %x, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %y, i32 1 + %1 = load i32, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %x, i32 1 + store i32 %1, i32* %arrayidx3, align 4 + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind diff --git a/test/CodeGen/ARM/load-combine-big-endian.ll b/test/CodeGen/ARM/load-combine-big-endian.ll new file mode 100644 index 000000000000..8d8a0136cf96 --- /dev/null +++ 
b/test/CodeGen/ARM/load-combine-big-endian.ll @@ -0,0 +1,779 @@ +; RUN: llc < %s -mtriple=armeb-unknown | FileCheck %s +; RUN: llc < %s -mtriple=armv6eb-unknown | FileCheck %s --check-prefix=CHECK-ARMv6 + +; i8* p; // p is 4 byte aligned +; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_big_endian(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_big_endian: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_big_endian: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 4 + %tmp2 = zext i8 %tmp1 to i32 + %tmp3 = shl nuw nsw i32 %tmp2, 24 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 8 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = or i32 %tmp13, %tmp16 + ret i32 %tmp17 +} + +; i8* p; // p is 4 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) +define i32 @load_i32_by_i8_bswap(i32* %arg) { +; BSWAP is not supported by 32 bit target +; CHECK-LABEL: load_i32_by_i8_bswap: +; CHECK: ldr r0, [r0] +; CHECK: and +; CHECK-NEXT: and +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p is 4 byte aligned +; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[3] << 8) | (i16) p[4]) +define i32 @load_i32_by_i16_by_i8_big_endian(i32* %arg) { +; CHECK-LABEL: load_i32_by_i16_by_i8_big_endian: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i16_by_i8_big_endian: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 4 + %tmp2 = zext i8 %tmp1 to i16 + %tmp3 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp4 = load i8, i8* %tmp3, align 1 + %tmp5 = zext i8 %tmp4 to i16 + %tmp6 = shl nuw nsw i16 %tmp2, 8 + %tmp7 = or i16 %tmp6, %tmp5 + %tmp8 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp9 = load i8, i8* %tmp8, align 1 + %tmp10 = zext i8 %tmp9 to i16 + %tmp11 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp12 = load i8, i8* %tmp11, align 1 + %tmp13 = zext i8 %tmp12 to i16 + %tmp14 = shl nuw nsw i16 %tmp10, 8 + %tmp15 = or i16 %tmp14, %tmp13 + %tmp16 = zext 
i16 %tmp7 to i32 + %tmp17 = zext i16 %tmp15 to i32 + %tmp18 = shl nuw nsw i32 %tmp16, 16 + %tmp19 = or i32 %tmp18, %tmp17 + ret i32 %tmp19 +} + +; i16* p; // p is 4 byte aligned +; ((i32) p[0] << 16) | (i32) p[1] +define i32 @load_i32_by_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_i16: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i16: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp2 = zext i16 %tmp1 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp5 = zext i16 %tmp4 to i32 + %tmp6 = shl nuw nsw i32 %tmp2, 16 + %tmp7 = or i32 %tmp6, %tmp5 + ret i32 %tmp7 +} + +; i16* p_16; // p_16 is 4 byte aligned +; i8* p_8 = (i8*) p_16; +; (i32) (p_16[0] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i16_i8(i32* %arg) { +; CHECK-LABEL: load_i32_by_i16_i8: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i16_i8: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i16* + %tmp1 = bitcast i32* %arg to i8* + %tmp2 = load i16, i16* %tmp, align 4 + %tmp3 = zext i16 %tmp2 to i32 + %tmp4 = shl nuw nsw i32 %tmp3, 16 + %tmp5 = getelementptr inbounds i8, i8* %tmp1, i32 2 + %tmp6 = load i8, i8* %tmp5, align 1 + %tmp7 = zext i8 %tmp6 to i32 + %tmp8 = shl nuw nsw i32 %tmp7, 8 + %tmp9 = getelementptr inbounds i8, i8* %tmp1, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = or i32 %tmp8, %tmp11 + %tmp13 = or i32 %tmp12, %tmp4 + ret i32 %tmp13 +} + +; i8* p; // p is 8 byte aligned +; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) +define i64 @load_i64_by_i8_bswap(i64* %arg) { +; CHECK-LABEL: load_i64_by_i8_bswap: +; CHECK: ldr{{.*}}r0 +; CHECK: ldr{{.*}}r0 +; CHECK: and +; CHECK-NEXT: and +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK-NEXT: and +; CHECK-NEXT: orr +; CHECK-NEXT: and +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: +; CHECK-ARMv6: ldrd r2, r3, [r0] +; CHECK-ARMv6: rev r0, r3 +; CHECK-ARMv6: rev r1, r2 +; CHECK-ARMv6: bx lr + %tmp = bitcast i64* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 8 + %tmp2 = zext i8 %tmp1 to i64 + %tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1 + %tmp4 = load i8, i8* %tmp3, align 1 + %tmp5 = zext i8 %tmp4 to i64 + %tmp6 = shl nuw nsw i64 %tmp5, 8 + %tmp7 = or i64 %tmp6, %tmp2 + %tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2 + %tmp9 = load i8, i8* %tmp8, align 1 + %tmp10 = zext i8 %tmp9 to i64 + %tmp11 = shl nuw nsw i64 %tmp10, 16 + %tmp12 = or i64 %tmp7, %tmp11 + %tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3 + %tmp14 = load i8, i8* %tmp13, align 1 + %tmp15 = zext i8 %tmp14 to i64 + %tmp16 = shl nuw nsw i64 %tmp15, 24 + %tmp17 = or i64 %tmp12, %tmp16 + %tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4 + %tmp19 = load i8, i8* %tmp18, align 1 + %tmp20 = zext i8 %tmp19 to i64 + %tmp21 = shl nuw nsw i64 %tmp20, 32 + %tmp22 = or i64 %tmp17, %tmp21 + %tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5 + %tmp24 = load i8, i8* %tmp23, align 1 + %tmp25 = zext i8 %tmp24 to i64 + %tmp26 = shl nuw nsw i64 %tmp25, 40 + %tmp27 = or i64 %tmp22, %tmp26 + %tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6 + %tmp29 = load i8, i8* %tmp28, align 1 + %tmp30 = zext i8 %tmp29 to i64 + %tmp31 = 
shl nuw nsw i64 %tmp30, 48 + %tmp32 = or i64 %tmp27, %tmp31 + %tmp33 = getelementptr inbounds i8, i8* %tmp, i64 7 + %tmp34 = load i8, i8* %tmp33, align 1 + %tmp35 = zext i8 %tmp34 to i64 + %tmp36 = shl nuw i64 %tmp35, 56 + %tmp37 = or i64 %tmp32, %tmp36 + ret i64 %tmp37 +} + +; i8* p; // p is 8 byte aligned +; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] +define i64 @load_i64_by_i8(i64* %arg) { +; CHECK-LABEL: load_i64_by_i8: +; CHECK: ldr r2, [r0] +; CHECK: ldr r1, [r0, #4] +; CHECK: mov r0, r2 +; CHECK: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i64_by_i8: +; CHECK-ARMv6: ldrd r0, r1, [r0] +; CHECK-ARMv6: bx lr + %tmp = bitcast i64* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 8 + %tmp2 = zext i8 %tmp1 to i64 + %tmp3 = shl nuw i64 %tmp2, 56 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i64 + %tmp7 = shl nuw nsw i64 %tmp6, 48 + %tmp8 = or i64 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i64 + %tmp12 = shl nuw nsw i64 %tmp11, 40 + %tmp13 = or i64 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i64 + %tmp17 = shl nuw nsw i64 %tmp16, 32 + %tmp18 = or i64 %tmp13, %tmp17 + %tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4 + %tmp20 = load i8, i8* %tmp19, align 1 + %tmp21 = zext i8 %tmp20 to i64 + %tmp22 = shl nuw nsw i64 %tmp21, 24 + %tmp23 = or i64 %tmp18, %tmp22 + %tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5 + %tmp25 = load i8, i8* %tmp24, align 1 + %tmp26 = zext i8 %tmp25 to i64 + %tmp27 = shl nuw nsw i64 %tmp26, 16 + %tmp28 = or i64 %tmp23, %tmp27 + %tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6 + %tmp30 = load i8, i8* %tmp29, align 1 + %tmp31 = zext i8 %tmp30 to i64 + %tmp32 = shl nuw nsw i64 %tmp31, 8 + %tmp33 = or i64 %tmp28, %tmp32 + %tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7 + %tmp35 = load i8, i8* %tmp34, align 1 + %tmp36 = zext i8 %tmp35 to i64 + %tmp37 = or i64 %tmp33, %tmp36 + ret i64 %tmp37 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK: ldr r0, [r0, #1] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK-ARMv6: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + 
%tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: ldr r0, [r0, #-4] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: +; CHECK-ARMv6: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: ldr r0, [r0, #1] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK-ARMv6: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: ldr r0, [r0, #-4] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK-ARMv6: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = 
shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +declare i16 @llvm.bswap.i16(i16) + +; i16* p; // p is 4 byte aligned +; (i32) bswap(p[0]) | (i32) bswap(p[1] << 16) +define i32 @load_i32_by_bswap_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_bswap_i16: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1) + %tmp2 = zext i16 %tmp11 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4) + %tmp5 = zext i16 %tmp41 to i32 + %tmp6 = shl nuw nsw i32 %tmp5, 16 + %tmp7 = or i32 %tmp6, %tmp2 + ret i32 %tmp7 +} + +; i16* p; // p is 4 byte aligned +; (i32) p[1] | (sext(p[0] << 16) to i32) +define i32 @load_i32_by_sext_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_sext_i16: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: load_i32_by_sext_i16: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp2 = sext i16 %tmp1 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp5 = zext i16 %tmp4 to i32 + %tmp6 = shl nuw nsw i32 %tmp2, 16 + %tmp7 = or i32 %tmp6, %tmp5 + ret i32 %tmp7 +} + +; i8* arg; i32 i; +; p = arg + 12; +; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24) +define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) { +; CHECK-LABEL: load_i32_by_i8_base_offset_index: +; CHECK: add r0, r0, r1 +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: ldr r0, [r0, #12] +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: +; CHECK-ARMv6: add r0, r0, r1 +; CHECK-ARMv6-NEXT: ldr r0, [r0, #12] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = add nuw nsw i32 %i, 3 + %tmp2 = add nuw nsw i32 %i, 2 + %tmp3 = add nuw nsw i32 %i, 1 + %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12 + %tmp5 = zext i32 %i to i64 + %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5 + %tmp7 = load i8, i8* %tmp6, align 4 + %tmp8 = zext i8 %tmp7 to i32 + %tmp9 = zext i32 %tmp3 to i64 + %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9 + %tmp11 = load i8, i8* %tmp10, align 1 + %tmp12 = zext i8 %tmp11 to i32 + %tmp13 = shl nuw nsw i32 %tmp12, 8 + %tmp14 = or i32 %tmp13, %tmp8 + %tmp15 = zext i32 %tmp2 to i64 + %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15 + %tmp17 = load i8, i8* %tmp16, align 1 + %tmp18 = zext i8 %tmp17 to i32 + %tmp19 = shl nuw nsw i32 %tmp18, 16 + %tmp20 = or i32 %tmp14, %tmp19 + %tmp21 = zext i32 %tmp to i64 + %tmp22 = 
getelementptr inbounds i8, i8* %tmp4, i64 %tmp21 + %tmp23 = load i8, i8* %tmp22, align 1 + %tmp24 = zext i8 %tmp23 to i32 + %tmp25 = shl nuw i32 %tmp24, 24 + %tmp26 = or i32 %tmp20, %tmp25 + ret i32 %tmp26 +} + +; i8* arg; i32 i; +; p = arg + 12; +; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24) +define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { +; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: +; CHECK: add r0, r0, r1 +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: ldr r0, [r0, #13] +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: +; CHECK-ARMv6: add r0, r0, r1 +; CHECK-ARMv6-NEXT: ldr r0, [r0, #13] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = add nuw nsw i32 %i, 4 + %tmp2 = add nuw nsw i32 %i, 3 + %tmp3 = add nuw nsw i32 %i, 2 + %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12 + %tmp5 = add nuw nsw i32 %i, 1 + %tmp27 = zext i32 %tmp5 to i64 + %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27 + %tmp29 = load i8, i8* %tmp28, align 4 + %tmp30 = zext i8 %tmp29 to i32 + %tmp31 = zext i32 %tmp3 to i64 + %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31 + %tmp33 = load i8, i8* %tmp32, align 1 + %tmp34 = zext i8 %tmp33 to i32 + %tmp35 = shl nuw nsw i32 %tmp34, 8 + %tmp36 = or i32 %tmp35, %tmp30 + %tmp37 = zext i32 %tmp2 to i64 + %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37 + %tmp39 = load i8, i8* %tmp38, align 1 + %tmp40 = zext i8 %tmp39 to i32 + %tmp41 = shl nuw nsw i32 %tmp40, 16 + %tmp42 = or i32 %tmp36, %tmp41 + %tmp43 = zext i32 %tmp to i64 + %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43 + %tmp45 = load i8, i8* %tmp44, align 1 + %tmp46 = zext i8 %tmp45 to i32 + %tmp47 = shl nuw i32 %tmp46, 24 + %tmp48 = or i32 %tmp42, %tmp47 + ret i32 %tmp48 +} + +; i8* p; // p is 2 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) +define i32 @zext_load_i32_by_i8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[0] << 8) | ((i32) p[1] << 16) +define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_shl_8: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r0, r0, #16 +; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_8: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r0, r0, #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 
2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 8 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[0] << 16) | ((i32) p[1] << 24) +define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_shl_16: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r0, r0, #24 +; CHECK-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_16: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r0, r0, #24 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 16 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 24 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; (i32) p[1] | ((i32) p[0] << 8) +define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[1] << 8) | ((i32) p[0] << 16) +define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r1, r1, #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_8: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r1, r1, #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 8 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[1] << 16) | ((i32) p[0] << 24) +define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r1, r1, #24 +; CHECK-NEXT: orr r0, r1, r0, lsl #16 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_16: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r1, r1, 
#24 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #16 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 16 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 24 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; +; i16* p1.i16 = (i16*) p; +; (p1.i16[0] << 8) | ((i16) p[2]) +; +; This is essentialy a i16 load from p[1], but we don't fold the pattern now +; because in the original DAG we don't have p[1] address available +define i16 @load_i16_from_nonzero_offset(i8* %p) { +; CHECK-LABEL: load_i16_from_nonzero_offset: +; CHECK: ldrh r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #2] +; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: load_i16_from_nonzero_offset: +; CHECK-ARMv6: ldrh r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #2] +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %p1.i16 = bitcast i8* %p to i16* + %p2.i8 = getelementptr i8, i8* %p, i64 2 + %v1 = load i16, i16* %p1.i16 + %v2.i8 = load i8, i8* %p2.i8 + %v2 = zext i8 %v2.i8 to i16 + %v1.shl = shl i16 %v1, 8 + %res = or i16 %v1.shl, %v2 + ret i16 %res +} diff --git a/test/CodeGen/ARM/load-combine.ll b/test/CodeGen/ARM/load-combine.ll new file mode 100644 index 000000000000..720bc7b88b32 --- /dev/null +++ b/test/CodeGen/ARM/load-combine.ll @@ -0,0 +1,692 @@ +; RUN: llc < %s -mtriple=arm-unknown | FileCheck %s +; RUN: llc < %s -mtriple=armv6-unknown | FileCheck %s --check-prefix=CHECK-ARMv6 + +; i8* p; // p is 1 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) +define i32 @load_i32_by_i8_unaligned(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_unaligned: +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: orr +; CHECK: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_unaligned: +; CHECK-ARMv6: ldrb{{.*}}r0 +; CHECK-ARMv6: ldrb{{.*}}r0 +; CHECK-ARMv6: ldrb{{.*}}r0 +; CHECK-ARMv6: ldrb{{.*}}r0 +; CHECK-ARMv6: orr +; CHECK-ARMv6: bx lr + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p is 4 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) +define i32 @load_i32_by_i8_aligned(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_aligned: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_aligned: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, 
i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p is 4 byte aligned +; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_bswap(i32* %arg) { +; BSWAP is not supported by 32 bit target +; CHECK-LABEL: load_i32_by_i8_bswap: +; CHECK: ldr r0, [r0] +; CHECK: and +; CHECK-NEXT: and +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 4 + %tmp2 = zext i8 %tmp1 to i32 + %tmp3 = shl nuw nsw i32 %tmp2, 24 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 8 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = or i32 %tmp13, %tmp16 + ret i32 %tmp17 +} + +; i8* p; // p is 8 byte aligned +; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) +define i64 @load_i64_by_i8(i64* %arg) { +; CHECK-LABEL: load_i64_by_i8: +; CHECK: ldr r2, [r0] +; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i64_by_i8: +; CHECK-ARMv6: ldrd r0, r1, [r0] +; CHECK-ARMv6: bx lr + %tmp = bitcast i64* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 8 + %tmp2 = zext i8 %tmp1 to i64 + %tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1 + %tmp4 = load i8, i8* %tmp3, align 1 + %tmp5 = zext i8 %tmp4 to i64 + %tmp6 = shl nuw nsw i64 %tmp5, 8 + %tmp7 = or i64 %tmp6, %tmp2 + %tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2 + %tmp9 = load i8, i8* %tmp8, align 1 + %tmp10 = zext i8 %tmp9 to i64 + %tmp11 = shl nuw nsw i64 %tmp10, 16 + %tmp12 = or i64 %tmp7, %tmp11 + %tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3 + %tmp14 = load i8, i8* %tmp13, align 1 + %tmp15 = zext i8 %tmp14 to i64 + %tmp16 = shl nuw nsw i64 %tmp15, 24 + %tmp17 = or i64 %tmp12, %tmp16 + %tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4 + %tmp19 = load i8, i8* %tmp18, align 1 + %tmp20 = zext i8 %tmp19 to i64 + %tmp21 = shl nuw nsw i64 %tmp20, 32 + %tmp22 = or i64 %tmp17, %tmp21 + %tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5 + %tmp24 = load i8, i8* %tmp23, align 1 + %tmp25 = zext i8 %tmp24 to i64 + %tmp26 = shl nuw nsw i64 %tmp25, 40 + %tmp27 = or i64 %tmp22, %tmp26 + %tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6 + %tmp29 = load i8, i8* %tmp28, align 1 + %tmp30 = zext i8 %tmp29 to i64 + %tmp31 = shl nuw nsw i64 %tmp30, 48 + %tmp32 = or i64 %tmp27, %tmp31 + %tmp33 = getelementptr inbounds i8, i8* %tmp, 
i64 7 + %tmp34 = load i8, i8* %tmp33, align 1 + %tmp35 = zext i8 %tmp34 to i64 + %tmp36 = shl nuw i64 %tmp35, 56 + %tmp37 = or i64 %tmp32, %tmp36 + ret i64 %tmp37 +} + +; i8* p; // p is 8 byte aligned +; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] +define i64 @load_i64_by_i8_bswap(i64* %arg) { +; CHECK-LABEL: load_i64_by_i8_bswap: +; CHECK: ldr{{.*}}r0 +; CHECK: ldr{{.*}}r0 +; CHECK: and +; CHECK-NEXT: and +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK-NEXT: and +; CHECK-NEXT: orr +; CHECK-NEXT: and +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK-NEXT: orr +; CHECK: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: +; CHECK-ARMv6: ldrd r2, r3, [r0] +; CHECK-ARMv6: rev r0, r3 +; CHECK-ARMv6: rev r1, r2 +; CHECK-ARMv6: bx lr + %tmp = bitcast i64* %arg to i8* + %tmp1 = load i8, i8* %tmp, align 8 + %tmp2 = zext i8 %tmp1 to i64 + %tmp3 = shl nuw i64 %tmp2, 56 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i64 + %tmp7 = shl nuw nsw i64 %tmp6, 48 + %tmp8 = or i64 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i64 + %tmp12 = shl nuw nsw i64 %tmp11, 40 + %tmp13 = or i64 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i64 + %tmp17 = shl nuw nsw i64 %tmp16, 32 + %tmp18 = or i64 %tmp13, %tmp17 + %tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4 + %tmp20 = load i8, i8* %tmp19, align 1 + %tmp21 = zext i8 %tmp20 to i64 + %tmp22 = shl nuw nsw i64 %tmp21, 24 + %tmp23 = or i64 %tmp18, %tmp22 + %tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5 + %tmp25 = load i8, i8* %tmp24, align 1 + %tmp26 = zext i8 %tmp25 to i64 + %tmp27 = shl nuw nsw i64 %tmp26, 16 + %tmp28 = or i64 %tmp23, %tmp27 + %tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6 + %tmp30 = load i8, i8* %tmp29, align 1 + %tmp31 = zext i8 %tmp30 to i64 + %tmp32 = shl nuw nsw i64 %tmp31, 8 + %tmp33 = or i64 %tmp28, %tmp32 + %tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7 + %tmp35 = load i8, i8* %tmp34, align 1 + %tmp36 = zext i8 %tmp35 to i64 + %tmp37 = or i64 %tmp33, %tmp36 + ret i64 %tmp37 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK: ldr r0, [r0, #1] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK-ARMv6: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-4] | ((i32) p[-3] << 
8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: ldr r0, [r0, #-4] +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: +; CHECK-ARMv6: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: ldr r0, [r0, #1] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK-ARMv6: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: ldr r0, [r0, #-4] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK-ARMv6: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = 
getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +declare i16 @llvm.bswap.i16(i16) + +; i16* p; // p is 4 byte aligned +; (i32) bswap(p[1]) | (i32) bswap(p[0] << 16) +define i32 @load_i32_by_bswap_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_bswap_i16: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1) + %tmp2 = zext i16 %tmp11 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4) + %tmp5 = zext i16 %tmp41 to i32 + %tmp6 = shl nuw nsw i32 %tmp2, 16 + %tmp7 = or i32 %tmp6, %tmp5 + ret i32 %tmp7 +} + +; i16* p; +; (i32) p[0] | (sext(p[1] << 16) to i32) +define i32 @load_i32_by_sext_i16(i32* %arg) { +; CHECK-LABEL: load_i32_by_sext_i16: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: load_i32_by_sext_i16: +; CHECK-ARMv6: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i16* + %tmp1 = load i16, i16* %tmp, align 4 + %tmp2 = zext i16 %tmp1 to i32 + %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1 + %tmp4 = load i16, i16* %tmp3, align 1 + %tmp5 = sext i16 %tmp4 to i32 + %tmp6 = shl nuw nsw i32 %tmp5, 16 + %tmp7 = or i32 %tmp6, %tmp2 + ret i32 %tmp7 +} + +; i8* arg; i32 i; +; p = arg + 12; +; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24) +define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) { +; CHECK-LABEL: load_i32_by_i8_base_offset_index: +; CHECK: add r0, r0, r1 +; CHECK-NEXT: ldr r0, [r0, #12] +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: +; CHECK-ARMv6: add r0, r0, r1 +; CHECK-ARMv6-NEXT: ldr r0, [r0, #12] +; CHECK-ARMv6-NEXT: bx lr + + %tmp = add nuw nsw i32 %i, 3 + %tmp2 = add nuw nsw i32 %i, 2 + %tmp3 = add nuw nsw i32 %i, 1 + %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12 + %tmp5 = zext i32 %i to i64 + %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5 + %tmp7 = load i8, i8* %tmp6, align 4 + %tmp8 = zext i8 %tmp7 to i32 + %tmp9 = zext i32 %tmp3 to i64 + %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9 + %tmp11 = load i8, i8* %tmp10, align 1 + %tmp12 = zext i8 %tmp11 to i32 + %tmp13 = shl nuw nsw i32 %tmp12, 8 + %tmp14 = or i32 %tmp13, %tmp8 + %tmp15 = zext i32 %tmp2 to i64 + %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15 + %tmp17 = load i8, i8* %tmp16, align 1 + %tmp18 = zext i8 %tmp17 to i32 + %tmp19 = shl nuw nsw i32 %tmp18, 16 + %tmp20 = or i32 %tmp14, %tmp19 + %tmp21 = zext i32 %tmp to i64 + %tmp22 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp21 + %tmp23 = load i8, i8* %tmp22, align 1 + %tmp24 = zext i8 %tmp23 to i32 + %tmp25 = shl nuw i32 %tmp24, 24 + %tmp26 = or i32 %tmp20, 
%tmp25 + ret i32 %tmp26 +} + +; i8* arg; i32 i; +; p = arg + 12; +; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24) +define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { +; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: +; CHECK: add r0, r0, r1 +; CHECK-NEXT: ldr r0, [r0, #13] +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: +; CHECK-ARMv6: add r0, r0, r1 +; CHECK-ARMv6-NEXT: ldr r0, [r0, #13] +; CHECK-ARMv6-NEXT: bx lr + %tmp = add nuw nsw i32 %i, 4 + %tmp2 = add nuw nsw i32 %i, 3 + %tmp3 = add nuw nsw i32 %i, 2 + %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12 + %tmp5 = add nuw nsw i32 %i, 1 + %tmp27 = zext i32 %tmp5 to i64 + %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27 + %tmp29 = load i8, i8* %tmp28, align 4 + %tmp30 = zext i8 %tmp29 to i32 + %tmp31 = zext i32 %tmp3 to i64 + %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31 + %tmp33 = load i8, i8* %tmp32, align 1 + %tmp34 = zext i8 %tmp33 to i32 + %tmp35 = shl nuw nsw i32 %tmp34, 8 + %tmp36 = or i32 %tmp35, %tmp30 + %tmp37 = zext i32 %tmp2 to i64 + %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37 + %tmp39 = load i8, i8* %tmp38, align 1 + %tmp40 = zext i8 %tmp39 to i32 + %tmp41 = shl nuw nsw i32 %tmp40, 16 + %tmp42 = or i32 %tmp36, %tmp41 + %tmp43 = zext i32 %tmp to i64 + %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43 + %tmp45 = load i8, i8* %tmp44, align 1 + %tmp46 = zext i8 %tmp45 to i32 + %tmp47 = shl nuw i32 %tmp46, 24 + %tmp48 = or i32 %tmp42, %tmp47 + ret i32 %tmp48 +} + +; i8* p; // p is 2 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) +define i32 @zext_load_i32_by_i8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[0] << 8) | ((i32) p[1] << 16) +define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_shl_8: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r0, r0, #16 +; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_8: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r0, r0, #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 8 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[0] << 16) | ((i32) p[1] << 24) +define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_shl_16: +; CHECK: 
ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r0, r0, #24 +; CHECK-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_16: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r0, r0, #24 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp2 = load i8, i8* %tmp1, align 2 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 16 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 24 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; (i32) p[1] | ((i32) p[0] << 8) +define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[1] << 8) | ((i32) p[0] << 16) +define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r1, r1, #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_8: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r1, r1, #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 8 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 16 + %tmp8 = or i32 %tmp7, %tmp30 + ret i32 %tmp8 +} + +; i8* p; // p is 2 byte aligned +; ((i32) p[1] << 16) | ((i32) p[0] << 24) +define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { +; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16: +; CHECK: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r1, r1, #24 +; CHECK-NEXT: orr r0, r1, r0, lsl #16 +; CHECK-NEXT: mov pc, lr +; +; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_16: +; CHECK-ARMv6: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r1, r1, #24 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #16 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp30 = shl nuw nsw i32 %tmp3, 16 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0 + %tmp5 = load i8, i8* %tmp4, align 2 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 24 + %tmp8 = or i32 %tmp7, %tmp30 + 
ret i32 %tmp8 +} diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll index 80cb5096c03c..9ecda8b06cbf 100644 --- a/test/CodeGen/ARM/longMAC.ll +++ b/test/CodeGen/ARM/longMAC.ll @@ -1,14 +1,15 @@ ; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -check-prefix=CHECK --check-prefix=CHECK-LE -; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s --check-prefix=CHECK-V7-LE +; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK --check-prefix=CHECK-V7-LE ; RUN: llc -mtriple=armeb-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE -; RUN: llc -mtriple=armebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-BE -; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6-THUMB -; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6-THUMB2 -; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-THUMB -; RUN: llc -mtriple=thumbebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-THUMB-BE -; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6M-THUMB -; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7M-THUMB -; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7EM-THUMB +; RUN: llc -mtriple=armebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V7-BE +; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6-THUMB +; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-T2-DSP +; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-T2-DSP +; RUN: llc -mtriple=thumbebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V7-THUMB-BE +; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6M-THUMB +; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V7M-THUMB +; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-T2-DSP +; RUN: llc -mtriple=armv5te-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V5TE ; Check generated signed and unsigned multiply accumulate long. 
define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) { @@ -20,12 +21,9 @@ define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) { ;CHECK-BE: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] @@ -44,12 +42,9 @@ define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) { ;CHECK-BE: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] @@ -78,8 +73,7 @@ define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) { ;CHECK-BE: umlal [[RDLO:r[0-9]+]], [[RDHI]], r1, r0 ;CHECK-BE: mov r0, [[RDHI]] ;CHECK-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umlal -;CHECK-V7-THUMB: umlal +;CHECK-T2-DSP: umlal ;CHECK-V6-THUMB-NOT: umlal %conv = zext i32 %b to i64 %conv1 = zext i32 %a to i64 @@ -92,8 +86,7 @@ define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) { define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) { ;CHECK-LABEL: MACLongTest4: ;CHECK-V6-THUMB-NOT: smlal -;CHECK-V6-THUMB2: smlal -;CHECK-V7-THUMB: smlal +;CHECK-T2-DSP: smlal ;CHECK-LE: asr [[RDHI:r[0-9]+]], [[RDLO:r[0-9]+]], #31 ;CHECK-LE: smlal [[RDLO]], [[RDHI]], r1, r0 ;CHECK-LE: mov r0, [[RDLO]] @@ -114,14 +107,12 @@ define i64 @MACLongTest6(i32 %a, i32 %b, i32 %c, i32 %d) { ;CHECK-LABEL: MACLongTest6: ;CHECK-V6-THUMB-NOT: smull ;CHECK-V6-THUMB-NOT: smlal -;CHECK: smull r12, lr, r1, r0 -;CHECK: smlal r12, lr, r3, r2 +;CHECK-LE: smull r12, lr, r1, r0 +;CHECK-LE: smlal r12, lr, r3, r2 ;CHECK-V7: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 ;CHECK-V7: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] -;CHECK-V7-THUMB: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 -;CHECK-V7-THUMB: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] -;CHECK-V6-THUMB2: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 -;CHECK-V6-THUMB2: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] +;CHECK-T2-DSP: smull [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0 +;CHECK-T2-DSP: smlal [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]] %conv = sext i32 %a to i64 %conv1 = sext i32 %b to i64 %mul = mul nsw i64 %conv1, %conv @@ -172,18 +163,12 @@ define i64 @MACLongTest9(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) { ;CHECK-V7-BE: umaal 
[[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-BE: mov r0, [[RDHI]] ;CHECK-V7-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] +;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]] +;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]] ;CHECK-V7-THUMB-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] -;CHECK-V7EM-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7EM-THUMB: mov r0, [[RDLO]] -;CHECK-V7EM-THUMB: mov r1, [[RDHI]] ;CHECK-NOT:umaal ;CHECK-V6-THUMB-NOT: umaal ;CHECK-V6M-THUMB-NOT: umaal @@ -206,18 +191,12 @@ define i64 @MACLongTest10(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) { ;CHECK-V7-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-BE: mov r0, [[RDHI]] ;CHECK-V7-BE: mov r1, [[RDLO]] -;CHECK-V6-THUMB2: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V6-THUMB2: mov r0, [[RDLO]] -;CHECK-V6-THUMB2: mov r1, [[RDHI]] -;CHECK-V7-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7-THUMB: mov r0, [[RDLO]] -;CHECK-V7-THUMB: mov r1, [[RDHI]] +;CHECK-T2-DSP: umaal r2, r3, r1, r0 +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 ;CHECK-V7-THUMB-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]] ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]] -;CHECK-V7EM-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]] -;CHECK-V7EM-THUMB: mov r0, [[RDLO]] -;CHECK-V7EM-THUMB: mov r1, [[RDHI]] ;CHECK-NOT:umaal ;CHECK-V6-THUMB-NOT:umaal ;CHECK-V6M-THUMB-NOT: umaal @@ -231,3 +210,188 @@ define i64 @MACLongTest10(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) { %add2 = add i64 %add, %mul ret i64 %add2 } + +define i64 @MACLongTest11(i16 %a, i16 %b, i64 %c) { +;CHECK-LABEL: MACLongTest11: +;CHECK-T2-DSP-NOT: sxth +;CHECK-T2-DSP: smlalbb r2, r3 +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 +;CHECK-V5TE-NOT: sxth +;CHECK-V5TE: smlalbb r2, r3 +;CHECK-V5TE-NEXT: mov r0, r2 +;CHECK-V5TE-NEXT: mov r1, r3 +;CHECK-V7-LE-NOT: sxth +;CHECK-V7-LE: smlalbb r2, r3 +;CHECK-V7-LE-NEXT: mov r0, r2 +;CHECK-V7-LE-NEXT: mov r1, r3 +;CHECK-V7-THUMB-BE: smlalbb r3, r2 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r2 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r3 +;CHECK-LE-NOT: smlalbb +;CHECK-BE-NOT: smlalbb +;CHECK-V6M-THUMB-NOT: smlalbb +;CHECK-V7M-THUMB-NOT: smlalbb + %conv = sext i16 %a to i32 + %conv1 = sext i16 %b to i32 + %mul = mul nsw i32 %conv1, %conv + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +define i64 @MACLongTest12(i16 %b, i32 %t, i64 %c) { +;CHECK-LABEL: MACLongTest12: +;CHECK-T2-DSP-NOT: sxth +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlalbt r2, r3, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 +;CHECK-T2-DSP-NOT: sxth +;CHECK-V5TE-NOT: sxth +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlalbt r2, r3, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r2 +;CHECK-V5TE-NEXT: mov r1, r3 +;CHECK-V7-LE-NOT: sxth +;CHECK-V7-LE-NOT: {{asr|lsr}} 
+;CHECK-V7-LE: smlalbt r2, r3, r0, r1 +;CHECK-V7-LE-NEXT: mov r0, r2 +;CHECK-V7-LE-NEXT: mov r1, r3 +;CHECK-V7-THUMB-BE: smlalbt r3, r2, +;CHECK-V7-THUMB-BE-NEXT: mov r0, r2 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r3 +;CHECK-LE-NOT: smlalbt +;CHECK-BE-NOT: smlalbt +;CHECK-V6M-THUMB-NOT: smlalbt +;CHECK-V7M-THUMB-NOT: smlalbt + %conv0 = sext i16 %b to i32 + %conv1 = ashr i32 %t, 16 + %mul = mul nsw i32 %conv0, %conv1 + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +define i64 @MACLongTest13(i32 %t, i16 %b, i64 %c) { +;CHECK-LABEL: MACLongTest13: +;CHECK-T2-DSP-NOT: sxth +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlaltb r2, r3, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 +;CHECK-V5TE-NOT: sxth +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlaltb r2, r3, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r2 +;CHECK-V5TE-NEXT: mov r1, r3 +;CHECK-V7-LE-NOT: sxth +;CHECK-V7-LE-NOT: {{asr|lsr}} +;CHECK-V7-LE: smlaltb r2, r3, r0, r1 +;CHECK-V7-LE-NEXT: mov r0, r2 +;CHECK-V7-LE-NEXT: mov r1, r3 +;CHECK-V7-THUMB-BE: smlaltb r3, r2, r0, r1 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r2 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r3 +;CHECK-LE-NOT: smlaltb +;CHECK-BE-NOT: smlaltb +;CHECK-V6M-THUMB-NOT: smlaltb +;CHECK-V7M-THUMB-NOT: smlaltb + %conv0 = ashr i32 %t, 16 + %conv1= sext i16 %b to i32 + %mul = mul nsw i32 %conv0, %conv1 + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +define i64 @MACLongTest14(i32 %a, i32 %b, i64 %c) { +;CHECK-LABEL: MACLongTest14: +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlaltt r2, r3, +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlaltt r2, r3, +;CHECK-V5TE-NEXT: mov r0, r2 +;CHECK-V5TE-NEXT: mov r1, r3 +;CHECK-V7-LE-NOT: {{asr|lsr}} +;CHECK-V7-LE: smlaltt r2, r3, +;CHECK-V7-LE-NEXT: mov r0, r2 +;CHECK-V7-LE-NEXT: mov r1, r3 +;CHECK-V7-THUMB-BE: smlaltt r3, r2, +;CHECK-V7-THUMB-BE-NEXT: mov r0, r2 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r3 +;CHECK-LE-NOT: smlaltt +;CHECK-BE-NOT: smlaltt +;CHECK-V6M-THUMB-NOT: smlaltt +;CHECK-V7M-THUMB-NOT: smlaltt + %conv0 = ashr i32 %a, 16 + %conv1 = ashr i32 %b, 16 + %mul = mul nsw i32 %conv1, %conv0 + %conv2 = sext i32 %mul to i64 + %add = add nsw i64 %conv2, %c + ret i64 %add +} + +@global_b = external global i16, align 2 +;CHECK-LABEL: MACLongTest15 +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlaltb r2, r3, r0, r1 +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlaltb r2, r3, r0, r1 +;CHECK-V5TE-NEXT: mov r0, r2 +;CHECK-V5TE-NEXT: mov r1, r3 +;CHECK-V7-LE-NOT: {{asr|lsr}} +;CHECK-V7-LE: smlaltb r2, r3, r0, r1 +;CHECK-V7-LE-NEXT: mov r0, r2 +;CHECK-V7-LE-NEXT: mov r1, r3 +;CHECK-V7-THUMB-BE: smlaltb r3, r2, r0, r1 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r2 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r3 +;CHECK-LE-NOT: smlaltb +;CHECK-BE-NOT: smlaltb +;CHECK-V6M-THUMB-NOT: smlaltb +;CHECK-V7M-THUMB-NOT: smlaltb +define i64 @MACLongTest15(i32 %t, i64 %acc) { +entry: + %0 = load i16, i16* @global_b, align 2 + %conv = sext i16 %0 to i32 + %shr = ashr i32 %t, 16 + %mul = mul nsw i32 %shr, %conv + %conv1 = sext i32 %mul to i64 + %add = add nsw i64 %conv1, %acc + ret i64 %add +} + +;CHECK-LABEL: MACLongTest16 +;CHECK-T2-DSP-NOT: {{asr|lsr}} +;CHECK-T2-DSP: smlalbt r2, r3, r1, r0 +;CHECK-T2-DSP-NEXT: mov r0, r2 +;CHECK-T2-DSP-NEXT: mov r1, r3 +;CHECK-V5TE-NOT: {{asr|lsr}} +;CHECK-V5TE: smlalbt r2, r3, r1, r0 +;CHECK-V5TE-NEXT: mov r0, r2 +;CHECK-V5TE-NEXT: mov r1, r3 
+;CHECK-V7-LE: smlalbt r2, r3, r1, r0 +;CHECK-V7-LE-NEXT: mov r0, r2 +;CHECK-V7-LE-NEXT: mov r1, r3 +;CHECK-V7-THUMB-BE: smlalbt r3, r2, r1, r0 +;CHECK-V7-THUMB-BE-NEXT: mov r0, r2 +;CHECK-V7-THUMB-BE-NEXT: mov r1, r3 +;CHECK-LE-NOT: smlalbt +;CHECK-BE-NOT: smlalbt +;CHECK-V6M-THUMB-NOT: smlalbt +;CHECK-V7M-THUMB-NOT: smlalbt +define i64 @MACLongTest16(i32 %t, i64 %acc) { +entry: + %0 = load i16, i16* @global_b, align 2 + %conv = sext i16 %0 to i32 + %shr = ashr i32 %t, 16 + %mul = mul nsw i32 %conv, %shr + %conv1 = sext i32 %mul to i64 + %add = add nsw i64 %conv1, %acc + ret i64 %add +} diff --git a/test/CodeGen/ARM/lowerMUL-newload.ll b/test/CodeGen/ARM/lowerMUL-newload.ll new file mode 100644 index 000000000000..93d765cba116 --- /dev/null +++ b/test/CodeGen/ARM/lowerMUL-newload.ll @@ -0,0 +1,115 @@ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=krait | FileCheck %s + +define void @func1(i16* %a, i16* %b, i16* %c) { +entry: +; The test case trying to vectorize the pseudo code below. +; a[i] = b[i] + c[i]; +; b[i] = a[i] * c[i]; +; a[i] = b[i] + a[i] * c[i]; +; +; Checking that vector load a[i] for "a[i] = b[i] + a[i] * c[i]" is +; scheduled before the first vector store to "a[i] = b[i] + c[i]". +; Checking that there is no vector load a[i] scheduled between the vector +; stores to a[i], otherwise the load of a[i] will be polluted by the first +; vector store to a[i]. +; +; This test case check that the chain information is updated during +; lowerMUL for the new created Load SDNode. + +; CHECK: vldr {{.*}} [r0, #16] +; CHECK: vstr {{.*}} [r0, #16] +; CHECK-NOT: vldr {{.*}} [r0, #16] +; CHECK: vstr {{.*}} [r0, #16] + + %scevgep0 = getelementptr i16, i16* %a, i32 8 + %vector_ptr0 = bitcast i16* %scevgep0 to <4 x i16>* + %vec0 = load <4 x i16>, <4 x i16>* %vector_ptr0, align 8 + %scevgep1 = getelementptr i16, i16* %b, i32 8 + %vector_ptr1 = bitcast i16* %scevgep1 to <4 x i16>* + %vec1 = load <4 x i16>, <4 x i16>* %vector_ptr1, align 8 + %0 = zext <4 x i16> %vec1 to <4 x i32> + %scevgep2 = getelementptr i16, i16* %c, i32 8 + %vector_ptr2 = bitcast i16* %scevgep2 to <4 x i16>* + %vec2 = load <4 x i16>, <4 x i16>* %vector_ptr2, align 8 + %1 = sext <4 x i16> %vec2 to <4 x i32> + %vec3 = add <4 x i32> %1, %0 + %2 = trunc <4 x i32> %vec3 to <4 x i16> + %scevgep3 = getelementptr i16, i16* %a, i32 8 + %vector_ptr3 = bitcast i16* %scevgep3 to <4 x i16>* + store <4 x i16> %2, <4 x i16>* %vector_ptr3, align 8 + %vector_ptr4 = bitcast i16* %scevgep2 to <4 x i16>* + %vec4 = load <4 x i16>, <4 x i16>* %vector_ptr4, align 8 + %3 = sext <4 x i16> %vec4 to <4 x i32> + %vec5 = mul <4 x i32> %3, %vec3 + %4 = trunc <4 x i32> %vec5 to <4 x i16> + %vector_ptr5 = bitcast i16* %scevgep1 to <4 x i16>* + store <4 x i16> %4, <4 x i16>* %vector_ptr5, align 8 + %5 = sext <4 x i16> %vec0 to <4 x i32> + %vector_ptr6 = bitcast i16* %scevgep2 to <4 x i16>* + %vec6 = load <4 x i16>, <4 x i16>* %vector_ptr6, align 8 + %6 = sext <4 x i16> %vec6 to <4 x i32> + %vec7 = mul <4 x i32> %6, %5 + %vec8 = add <4 x i32> %vec7, %vec5 + %7 = trunc <4 x i32> %vec8 to <4 x i16> + %vector_ptr7 = bitcast i16* %scevgep3 to <4 x i16>* + store <4 x i16> %7, <4 x i16>* %vector_ptr7, align 8 + ret void +} + +define void @func2(i16* %a, i16* %b, i16* %c) { +entry: +; The test case trying to vectorize the pseudo code below. +; a[i] = b[i] + c[i]; +; b[i] = a[i] * c[i]; +; a[i] = b[i] + a[i] * c[i] + a[i]; +; +; Checking that vector load a[i] for "a[i] = b[i] + a[i] * c[i] + a[i]" +; is scheduled before the first vector store to "a[i] = b[i] + c[i]". 
+; Checking that there is no vector load a[i] scheduled between the first +; vector store to a[i] and the vector add of a[i], otherwise the load of +; a[i] will be polluted by the first vector store to a[i]. +; +; This test case check that both the chain and value of the new created +; Load SDNode are updated during lowerMUL. + +; CHECK: vldr {{.*}} [r0, #16] +; CHECK: vstr {{.*}} [r0, #16] +; CHECK-NOT: vldr {{.*}} [r0, #16] +; CHECK: vaddw.s16 +; CHECK: vstr {{.*}} [r0, #16] + + %scevgep0 = getelementptr i16, i16* %a, i32 8 + %vector_ptr0 = bitcast i16* %scevgep0 to <4 x i16>* + %vec0 = load <4 x i16>, <4 x i16>* %vector_ptr0, align 8 + %scevgep1 = getelementptr i16, i16* %b, i32 8 + %vector_ptr1 = bitcast i16* %scevgep1 to <4 x i16>* + %vec1 = load <4 x i16>, <4 x i16>* %vector_ptr1, align 8 + %0 = zext <4 x i16> %vec1 to <4 x i32> + %scevgep2 = getelementptr i16, i16* %c, i32 8 + %vector_ptr2 = bitcast i16* %scevgep2 to <4 x i16>* + %vec2 = load <4 x i16>, <4 x i16>* %vector_ptr2, align 8 + %1 = sext <4 x i16> %vec2 to <4 x i32> + %vec3 = add <4 x i32> %1, %0 + %2 = trunc <4 x i32> %vec3 to <4 x i16> + %scevgep3 = getelementptr i16, i16* %a, i32 8 + %vector_ptr3 = bitcast i16* %scevgep3 to <4 x i16>* + store <4 x i16> %2, <4 x i16>* %vector_ptr3, align 8 + %vector_ptr4 = bitcast i16* %scevgep2 to <4 x i16>* + %vec4 = load <4 x i16>, <4 x i16>* %vector_ptr4, align 8 + %3 = sext <4 x i16> %vec4 to <4 x i32> + %vec5 = mul <4 x i32> %3, %vec3 + %4 = trunc <4 x i32> %vec5 to <4 x i16> + %vector_ptr5 = bitcast i16* %scevgep1 to <4 x i16>* + store <4 x i16> %4, <4 x i16>* %vector_ptr5, align 8 + %5 = sext <4 x i16> %vec0 to <4 x i32> + %vector_ptr6 = bitcast i16* %scevgep2 to <4 x i16>* + %vec6 = load <4 x i16>, <4 x i16>* %vector_ptr6, align 8 + %6 = sext <4 x i16> %vec6 to <4 x i32> + %vec7 = mul <4 x i32> %6, %5 + %vec8 = add <4 x i32> %vec7, %vec5 + %vec9 = add <4 x i32> %vec8, %5 + %7 = trunc <4 x i32> %vec9 to <4 x i16> + %vector_ptr7 = bitcast i16* %scevgep3 to <4 x i16>* + store <4 x i16> %7, <4 x i16>* %vector_ptr7, align 8 + ret void +} diff --git a/test/CodeGen/ARM/mature-mc-support.ll b/test/CodeGen/ARM/mature-mc-support.ll index 0a7e5b91adc5..f89657dd81ac 100644 --- a/test/CodeGen/ARM/mature-mc-support.ll +++ b/test/CodeGen/ARM/mature-mc-support.ll @@ -9,4 +9,4 @@ module asm " .this_directive_is_very_unlikely_to_exist" -; CHECK: LLVM ERROR: Error parsing inline asm +; CHECK: error: unknown directive diff --git a/test/CodeGen/ARM/misched-fp-basic.ll b/test/CodeGen/ARM/misched-fp-basic.ll new file mode 100644 index 000000000000..27ad2cec34fd --- /dev/null +++ b/test/CodeGen/ARM/misched-fp-basic.ll @@ -0,0 +1,69 @@ +; REQUIRES: asserts +; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a9 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 +; RUN: llc < %s -mtriple=arm-eabi -mcpu=swift -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT +; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 +; +; Check the latency of instructions for processors with sched-models +; +; Function Attrs: norecurse nounwind readnone +define i32 @foo(float %a, float %b, float %c, i32 %d) local_unnamed_addr #0 { +entry: +; +; CHECK: ********** MI Scheduling ********** +; CHECK_A9: VADDS 
+; CHECK_SWIFT: VADDfd +; CHECK_R52: VADDS +; CHECK_A9: Latency : 5 +; CHECK_SWIFT: Latency : 4 +; CHECK_R52: Latency : 6 +; +; CHECK_A9: VMULS +; CHECK_SWIFT: VMULfd +; CHECK_R52: VMULS +; CHECK_SWIFT: Latency : 4 +; CHECK_A9: Latency : 6 +; CHECK_R52: Latency : 6 +; +; CHECK: VDIVS +; CHECK_SWIFT: Latency : 17 +; CHECK_A9: Latency : 16 +; CHECK_R52: Latency : 7 +; +; CHECK: VCVTDS +; CHECK_SWIFT: Latency : 4 +; CHECK_A9: Latency : 5 +; CHECK_R52: Latency : 6 +; +; CHECK: VADDD +; CHECK_SWIFT: Latency : 6 +; CHECK_A9: Latency : 5 +; CHECK_R52: Latency : 6 +; +; CHECK: VMULD +; CHECK_SWIFT: Latency : 6 +; CHECK_A9: Latency : 7 +; CHECK_R52: Latency : 6 +; +; CHECK: VDIVD +; CHECK_SWIFT: Latency : 32 +; CHECK_A9: Latency : 26 +; CHECK_R52: Latency : 17 +; +; CHECK: VTOSIZD +; CHECK_SWIFT: Latency : 4 +; CHECK_A9: Latency : 5 +; CHECK_R52: Latency : 6 +; + %add = fadd float %a, %b + %mul = fmul float %add, %add + %div = fdiv float %mul, %b + %conv1 = fpext float %div to double + %add3 = fadd double %conv1, %conv1 + %mul4 = fmul double %add3, %add3 + %div5 = fdiv double %mul4, %conv1 + %conv6 = fptosi double %div5 to i32 + ret i32 %conv6 +} diff --git a/test/CodeGen/ARM/misched-int-basic-thumb2.mir b/test/CodeGen/ARM/misched-int-basic-thumb2.mir new file mode 100644 index 000000000000..86ef1e26f636 --- /dev/null +++ b/test/CodeGen/ARM/misched-int-basic-thumb2.mir @@ -0,0 +1,175 @@ +# Basic machine sched model test for Thumb2 int instructions +# RUN: llc -o /dev/null %s -mtriple=thumbv7-eabi -mcpu=swift -run-pass machine-scheduler -enable-misched -verify-misched \ +# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT +# RUN: llc -o /dev/null %s -mtriple=thumbv7--eabi -mcpu=cortex-a9 -run-pass machine-scheduler -enable-misched -verify-misched \ +# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 +# RUN: llc -o /dev/null %s -mtriple=thumbv8r-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -verify-misched \ +# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 +# REQUIRES: asserts +--- | + ; ModuleID = 'foo.ll' + source_filename = "foo.ll" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv7---eabi" + + @g1 = common global i32 0, align 4 + @g2 = common global i32 0, align 4 + + define i64 @foo(i16 signext %a, i16 signext %b) { + entry: + %0 = load i32, i32* @g1, align 4 + %1 = load i32, i32* @g2, align 4 + %2 = add nuw nsw i32 %0, %0 + %3 = sdiv i32 %2, %1 + store i32 %3, i32* @g1, align 4 + %d = mul nsw i16 %a, %a + %e = mul nsw i16 %b, %b + %f = add nuw nsw i16 %e, %d + %c = zext i16 %f to i32 + %mul8 = mul nsw i32 %c, %3 + %mul9 = mul nsw i32 %mul8, %mul8 + %add10 = add nuw nsw i32 %mul9, %mul8 + %conv1130 = zext i32 %add10 to i64 + %mul12 = mul nuw nsw i64 %conv1130, %conv1130 + %mul13 = mul nsw i64 %mul12, %mul12 + %add14 = add nuw nsw i64 %mul13, %mul12 + ret i64 %add14 + } +# +# CHECK: ********** MI Scheduling ********** +# CHECK: SU(2): %vreg2 = t2MOVi32imm ; rGPR:%vreg2 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 2 +# CHECK_R52: Latency : 2 +# +# CHECK: SU(3): %vreg3 = t2LDRi12 %vreg2, 0, pred:14, pred:%noreg; mem:LD4[@g1](dereferenceable) rGPR:%vreg3,%vreg2 +# CHECK_A9: Latency : 1 +# CHECK_SWIFT: Latency : 3 +# CHECK_R52: Latency : 4 +# +# CHECK : SU(6): %vreg6 = t2ADDrr %vreg3, %vreg3, pred:14, pred:%noreg, opt:%noreg; rGPR:%vreg6,%vreg3,%vreg3 +# CHECK_A9: Latency : 1 +# CHECK_SWIFT: Latency : 
1 +# CHECK_R52: Latency : 3 + +# CHECK: SU(7): %vreg7 = t2SDIV %vreg6, %vreg5, pred:14, pred:%noreg; rGPR:%vreg7,%vreg6,%vreg5 +# CHECK_A9: Latency : 0 +# CHECK_SWIFT: Latency : 14 +# CHECK_R52: Latency : 8 + +# CHECK: SU(8): t2STRi12 %vreg7, %vreg2, 0, pred:14, pred:%noreg; mem:ST4[@g1] rGPR:%vreg7,%vreg2 +# CHECK_A9: Latency : 1 +# CHECK_SWIFT: Latency : 0 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(9): %vreg8 = t2SMULBB %vreg1, %vreg1, pred:14, pred:%noreg; rGPR:%vreg8,%vreg1,%vreg1 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 4 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(10): %vreg9 = t2SMLABB %vreg0, %vreg0, %vreg8, pred:14, pred:%noreg; rGPR:%vreg9,%vreg0,%vreg0,%vreg8 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 4 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(11): %vreg10 = t2UXTH %vreg9, 0, pred:14, pred:%noreg; rGPR:%vreg10,%vreg9 +# CHECK_A9: Latency : 1 +# CHECK_SWIFT: Latency : 1 +# CHECK_R52: Latency : 3 +# +# CHECK: SU(12): %vreg11 = t2MUL %vreg10, %vreg7, pred:14, pred:%noreg; rGPR:%vreg11,%vreg10,%vreg7 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 4 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(13): %vreg12 = t2MLA %vreg11, %vreg11, %vreg11, pred:14, pred:%noreg; rGPR:%vreg12,%vreg11,%vreg11,%vreg11 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 4 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(14): %vreg13, %vreg14 = t2UMULL %vreg12, %vreg12, pred:14, pred:%noreg; rGPR:%vreg13,%vreg14,%vreg12,%vreg12 +# CHECK_A9: Latency : 3 +# CHECK_SWIFT: Latency : 5 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(18): %vreg19, %vreg20 = t2UMLAL %vreg12, %vreg12, %vreg19, %vreg20, pred:14, pred:%noreg; rGPR:%vreg19,%vreg20,%vreg12,%vreg12,%vreg20 +# CHECK_A9: Latency : 3 +# CHECK_SWIFT: Latency : 7 +# CHECK_R52: Latency : 4 +# CHECK: ** ScheduleDAGMILive::schedule picking next node +... 
+--- +name: foo +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: rgpr } + - { id: 1, class: rgpr } + - { id: 2, class: rgpr } + - { id: 3, class: rgpr } + - { id: 4, class: rgpr } + - { id: 5, class: rgpr } + - { id: 6, class: rgpr } + - { id: 7, class: rgpr } + - { id: 8, class: rgpr } + - { id: 9, class: rgpr } + - { id: 10, class: rgpr } + - { id: 11, class: rgpr } + - { id: 12, class: rgpr } + - { id: 13, class: rgpr } + - { id: 14, class: rgpr } + - { id: 15, class: rgpr } + - { id: 16, class: rgpr } + - { id: 17, class: rgpr } + - { id: 18, class: rgpr } + - { id: 19, class: rgpr } + - { id: 20, class: rgpr } +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.entry: + liveins: %r0, %r1 + + %1 = COPY %r1 + %0 = COPY %r0 + %2 = t2MOVi32imm @g1 + %3 = t2LDRi12 %2, 0, 14, _ :: (dereferenceable load 4 from @g1) + %4 = t2MOVi32imm @g2 + %5 = t2LDRi12 %4, 0, 14, _ :: (dereferenceable load 4 from @g2) + %6 = t2ADDrr %3, %3, 14, _, _ + %7 = t2SDIV %6, %5, 14, _ + t2STRi12 %7, %2, 0, 14, _ :: (store 4 into @g1) + %8 = t2SMULBB %1, %1, 14, _ + %9 = t2SMLABB %0, %0, %8, 14, _ + %10 = t2UXTH %9, 0, 14, _ + %11 = t2MUL %10, %7, 14, _ + %12 = t2MLA %11, %11, %11, 14, _ + %13, %14 = t2UMULL %12, %12, 14, _ + %19, %16 = t2UMULL %13, %13, 14, _ + %17 = t2MLA %13, %14, %16, 14, _ + %20 = t2MLA %13, %14, %17, 14, _ + %19, %20 = t2UMLAL %12, %12, %19, %20, 14, _ + %r0 = COPY %19 + %r1 = COPY %20 + tBX_RET 14, _, implicit %r0, implicit %r1 + +... 
diff --git a/test/CodeGen/ARM/misched-int-basic.mir b/test/CodeGen/ARM/misched-int-basic.mir new file mode 100644 index 000000000000..f237c0a07b2e --- /dev/null +++ b/test/CodeGen/ARM/misched-int-basic.mir @@ -0,0 +1,128 @@ +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=swift -run-pass machine-scheduler -enable-misched -verify-misched \ +# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-a9 -run-pass machine-scheduler -enable-misched -verify-misched \ +# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -verify-misched \ +# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 +# REQUIRES: asserts +--- | + ; ModuleID = 'foo.ll' + source_filename = "foo.ll" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "arm---eabi" + + define i64 @foo(i16 signext %a, i16 signext %b) { + entry: + %d = mul nsw i16 %a, %a + %e = mul nsw i16 %b, %b + %f = add nuw nsw i16 %e, %d + %c = zext i16 %f to i32 + %mul8 = mul nsw i32 %c, %c + %mul9 = mul nsw i32 %mul8, %mul8 + %add10 = add nuw nsw i32 %mul9, %mul8 + %conv1130 = zext i32 %add10 to i64 + %mul12 = mul nuw nsw i64 %conv1130, %conv1130 + %mul13 = mul nsw i64 %mul12, %mul12 + %add14 = add nuw nsw i64 %mul13, %mul12 + ret i64 %add14 + } + +# CHECK: ********** MI Scheduling ********** +# CHECK: SU(2): %vreg2 = SMULBB %vreg1, %vreg1, pred:14, pred:%noreg; GPR:%vreg2,%vreg1,%vreg1 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 4 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(3): %vreg3 = SMLABB %vreg0, %vreg0, %vreg2, pred:14, pred:%noreg; GPRnopc:%vreg3,%vreg0,%vreg0 GPR:%vreg2 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 4 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(4): %vreg4 = UXTH %vreg3, 0, pred:14, pred:%noreg; GPRnopc:%vreg4,%vreg3 +# CHECK_A9: Latency : 1 +# CHECK_SWIFT: Latency : 1 +# CHECK_R52: Latency : 3 +# +# CHECK: SU(5): %vreg5 = MUL %vreg4, %vreg4, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg5,%vreg4,%vreg4 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 4 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(6): %vreg6 = MLA %vreg5, %vreg5, %vreg5, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg6,%vreg5,%vreg5,%vreg5 +# CHECK_A9: Latency : 2 +# CHECK_SWIFT: Latency : 4 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(7): %vreg7, %vreg8 = UMULL %vreg6, %vreg6, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg7,%vreg8,%vreg6,%vreg6 +# CHECK_A9: Latency : 3 +# CHECK_SWIFT: Latency : 5 +# CHECK_R52: Latency : 4 +# +# CHECK: SU(11): %vreg13, %vreg14 = UMLAL %vreg6, %vreg6, %vreg13, %vreg14, pred:14, pred:%noreg, opt:%noreg; GPR:%vreg13 GPRnopc:%vreg14,%vreg6,%vreg6 +# CHECK_SWIFT: Latency : 7 +# CHECK_A9: Latency : 3 +# CHECK_R52: Latency : 4 +# CHECK: ** ScheduleDAGMILive::schedule picking next node +... 
+--- +name: foo +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: gprnopc } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gprnopc } + - { id: 4, class: gprnopc } + - { id: 5, class: gprnopc } + - { id: 6, class: gprnopc } + - { id: 7, class: gprnopc } + - { id: 8, class: gprnopc } + - { id: 9, class: gpr } + - { id: 10, class: gprnopc } + - { id: 11, class: gprnopc } + - { id: 12, class: gprnopc } + - { id: 13, class: gpr } + - { id: 14, class: gprnopc } +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.entry: + liveins: %r0, %r1 + + %1 = COPY %r1 + %0 = COPY %r0 + %2 = SMULBB %1, %1, 14, _ + %3 = SMLABB %0, %0, %2, 14, _ + %4 = UXTH %3, 0, 14, _ + %5 = MUL %4, %4, 14, _, _ + %6 = MLA %5, %5, %5, 14, _, _ + %7, %8 = UMULL %6, %6, 14, _, _ + %13, %10 = UMULL %7, %7, 14, _, _ + %11 = MLA %7, %8, %10, 14, _, _ + %14 = MLA %7, %8, %11, 14, _, _ + %13, %14 = UMLAL %6, %6, %13, %14, 14, _, _ + %r0 = COPY %13 + %r1 = COPY %14 + BX_RET 14, _, implicit %r0, implicit %r1 + +... diff --git a/test/CodeGen/ARM/movt.ll b/test/CodeGen/ARM/movt.ll index da9b698f2099..f51582031bd5 100644 --- a/test/CodeGen/ARM/movt.ll +++ b/test/CodeGen/ARM/movt.ll @@ -2,10 +2,15 @@ ; rdar://7317664 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8m.base -mcpu=cortex-m23 %s -o - | FileCheck %s --check-prefix=NOMOVT +; RUN: llc -mtriple=thumbv8m.base -mcpu=cortex-m33 %s -o - | FileCheck %s define i32 @t(i32 %X) nounwind { ; CHECK-LABEL: t: ; CHECK: movt r{{[0-9]}}, #65535 +; NOMOVT-LABEL: t: +; NOMOVT-NOT: movt r{{[0-9]}}, #65535 +; NOMOVT: ldr r{{[0-9]}}, .LCP entry: %0 = or i32 %X, -65536 ret i32 %0 @@ -14,6 +19,9 @@ entry: define i32 @t2(i32 %X) nounwind { ; CHECK-LABEL: t2: ; CHECK: movt r{{[0-9]}}, #65534 +; NOMOVT-LABEL: t2: +; NOMOVT-NOT: movt r{{[0-9]}}, #65534 +; NOMOVT: ldr r{{[0-9]}}, .LCP entry: %0 = or i32 %X, -131072 %1 = and i32 %0, -65537 diff --git a/test/CodeGen/ARM/msr-it-block.ll b/test/CodeGen/ARM/msr-it-block.ll index 0f9ff6b29d79..8d4ddc3a4985 100644 --- a/test/CodeGen/ARM/msr-it-block.ll +++ b/test/CodeGen/ARM/msr-it-block.ll @@ -20,8 +20,8 @@ write_reg: ; V6M: msr apsr, {{r[0-9]+}} ; V7M: msr apsr_nzcvq, {{r[0-9]+}} ; V7M: msr apsr_nzcvq, {{r[0-9]+}} -; V7A: msr APSR_nzcvqg, {{r[0-9]+}} -; V7A: msr APSR_nzcvqg, {{r[0-9]+}} +; V7A: msr APSR_nzcvq, {{r[0-9]+}} +; V7A: msr APSR_nzcvq, {{r[0-9]+}} br label %exit exit: @@ -41,8 +41,8 @@ write_reg: ; V6M: msr apsr, {{r[0-9]+}} ; V7M: msr apsr_nzcvq, {{r[0-9]+}} ; V7M: msr apsr_nzcvq, {{r[0-9]+}} -; V7A: msr APSR_nzcvqg, {{r[0-9]+}} -; V7A: msr APSR_nzcvqg, {{r[0-9]+}} +; V7A: msr APSR_nzcvq, {{r[0-9]+}} +; V7A: msr APSR_nzcvq, {{r[0-9]+}} br label %exit exit: diff --git a/test/CodeGen/ARM/neon_vabs.ll b/test/CodeGen/ARM/neon_vabs.ll index d32e7b78879b..109d09582afd 100644 --- a/test/CodeGen/ARM/neon_vabs.ll +++ b/test/CodeGen/ARM/neon_vabs.ll @@ -1,8 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s define <4 x i32> @test1(<4 
x i32> %a) nounwind { ; CHECK-LABEL: test1: -; CHECK: vabs.s32 q +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <4 x i32> zeroinitializer, %a %b = icmp sgt <4 x i32> %a, %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg @@ -11,7 +18,13 @@ define <4 x i32> @test1(<4 x i32> %a) nounwind { define <4 x i32> @test2(<4 x i32> %a) nounwind { ; CHECK-LABEL: test2: -; CHECK: vabs.s32 q +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <4 x i32> zeroinitializer, %a %b = icmp sge <4 x i32> %a, zeroinitializer %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg @@ -20,7 +33,13 @@ define <4 x i32> @test2(<4 x i32> %a) nounwind { define <8 x i16> @test3(<8 x i16> %a) nounwind { ; CHECK-LABEL: test3: -; CHECK: vabs.s16 q +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s16 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <8 x i16> zeroinitializer, %a %b = icmp sgt <8 x i16> %a, zeroinitializer %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg @@ -29,7 +48,13 @@ define <8 x i16> @test3(<8 x i16> %a) nounwind { define <16 x i8> @test4(<16 x i8> %a) nounwind { ; CHECK-LABEL: test4: -; CHECK: vabs.s8 q +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s8 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <16 x i8> zeroinitializer, %a %b = icmp slt <16 x i8> %a, zeroinitializer %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a @@ -38,7 +63,13 @@ define <16 x i8> @test4(<16 x i8> %a) nounwind { define <4 x i32> @test5(<4 x i32> %a) nounwind { ; CHECK-LABEL: test5: -; CHECK: vabs.s32 q +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <4 x i32> zeroinitializer, %a %b = icmp sle <4 x i32> %a, zeroinitializer %abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a @@ -47,7 +78,11 @@ define <4 x i32> @test5(<4 x i32> %a) nounwind { define <2 x i32> @test6(<2 x i32> %a) nounwind { ; CHECK-LABEL: test6: -; CHECK: vabs.s32 d +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s32 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <2 x i32> zeroinitializer, %a %b = icmp sgt <2 x i32> %a, %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg @@ -56,7 +91,11 @@ define <2 x i32> @test6(<2 x i32> %a) nounwind { define <2 x i32> @test7(<2 x i32> %a) nounwind { ; CHECK-LABEL: test7: -; CHECK: vabs.s32 d +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s32 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <2 x i32> zeroinitializer, %a %b = icmp sge <2 x i32> %a, zeroinitializer %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg @@ -65,7 +104,11 @@ define <2 x i32> @test7(<2 x i32> %a) nounwind { define <4 x i16> @test8(<4 x i16> %a) nounwind { ; CHECK-LABEL: test8: -; CHECK: vabs.s16 d +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r0, r1 +; 
CHECK-NEXT: vabs.s16 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <4 x i16> zeroinitializer, %a %b = icmp sgt <4 x i16> %a, zeroinitializer %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg @@ -74,7 +117,11 @@ define <4 x i16> @test8(<4 x i16> %a) nounwind { define <8 x i8> @test9(<8 x i8> %a) nounwind { ; CHECK-LABEL: test9: -; CHECK: vabs.s8 d +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s8 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <8 x i8> zeroinitializer, %a %b = icmp slt <8 x i8> %a, zeroinitializer %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a @@ -83,7 +130,11 @@ define <8 x i8> @test9(<8 x i8> %a) nounwind { define <2 x i32> @test10(<2 x i32> %a) nounwind { ; CHECK-LABEL: test10: -; CHECK: vabs.s32 d +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vabs.s32 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1neg = sub <2 x i32> zeroinitializer, %a %b = icmp sle <2 x i32> %a, zeroinitializer %abs = select <2 x i1> %b, <2 x i32> %tmp1neg, <2 x i32> %a @@ -95,7 +146,13 @@ define <2 x i32> @test10(<2 x i32> %a) nounwind { define <4 x i32> @test11(<4 x i16> %a, <4 x i16> %b) nounwind { ; CHECK-LABEL: test11: -; CHECK: vabdl.u16 q +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vabdl.u16 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %zext1 = zext <4 x i16> %a to <4 x i32> %zext2 = zext <4 x i16> %b to <4 x i32> %diff = sub <4 x i32> %zext1, %zext2 @@ -106,7 +163,13 @@ define <4 x i32> @test11(<4 x i16> %a, <4 x i16> %b) nounwind { } define <8 x i16> @test12(<8 x i8> %a, <8 x i8> %b) nounwind { ; CHECK-LABEL: test12: -; CHECK: vabdl.u8 q +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vabdl.u8 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %zext1 = zext <8 x i8> %a to <8 x i16> %zext2 = zext <8 x i8> %b to <8 x i16> %diff = sub <8 x i16> %zext1, %zext2 @@ -118,7 +181,13 @@ define <8 x i16> @test12(<8 x i8> %a, <8 x i8> %b) nounwind { define <2 x i64> @test13(<2 x i32> %a, <2 x i32> %b) nounwind { ; CHECK-LABEL: test13: -; CHECK: vabdl.u32 q +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vabdl.u32 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %zext1 = zext <2 x i32> %a to <2 x i64> %zext2 = zext <2 x i32> %b to <2 x i64> %diff = sub <2 x i64> %zext1, %zext2 diff --git a/test/CodeGen/ARM/no-cmov2bfi.ll b/test/CodeGen/ARM/no-cmov2bfi.ll new file mode 100644 index 000000000000..c8b512048905 --- /dev/null +++ b/test/CodeGen/ARM/no-cmov2bfi.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -mtriple=thumbv7 | FileCheck --check-prefix=CHECK-NOBFI %s + +declare zeroext i1 @dummy() + +define i8 @test(i8 %a1, i1 %c) { +; CHECK-NOBFI-NOT: bfi +; CHECK-NOBFI: bl dummy +; CHECK-NOBFI: cmp r0, #0 +; CHECK-NOBFI: it ne +; CHECK-NOBFI: orrne [[REG:r[0-9]+]], [[REG]], #8 +; CHECK-NOBFI: mov r0, [[REG]] + + %1 = and i8 %a1, -9 + %2 = select i1 %c, i8 %1, i8 %a1 + %3 = tail call zeroext i1 @dummy() + %4 = or i8 %2, 8 + %ret = select i1 %3, i8 %4, i8 %2 + ret i8 %ret +} diff --git a/test/CodeGen/ARM/phi.ll b/test/CodeGen/ARM/phi.ll index ff85052175c8..568f7572b32e 100644 --- a/test/CodeGen/ARM/phi.ll +++ b/test/CodeGen/ARM/phi.ll @@ -1,5 +1,4 @@ ; RUN: 
llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s -; RUN: llc -mtriple=arm-eabi -mattr=+v4t -addr-sink-using-gep=1 %s -o - | FileCheck %s ; diff --git a/test/CodeGen/ARM/pr32545.ll b/test/CodeGen/ARM/pr32545.ll new file mode 100644 index 000000000000..5bfb01b45983 --- /dev/null +++ b/test/CodeGen/ARM/pr32545.ll @@ -0,0 +1,22 @@ +; RUN: llc %s -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7--linux-gnueabi" + +; CHECK: vld1.16 {[[DREG:d[0-9]+]][0]}, {{.*}} +; CHECK: vmovl.u8 [[QREG:q[0-9]+]], [[DREG]] +; CHECK: vmovl.u16 [[QREG]], [[DREG]] + +define void @f(i32 %dstStride, i8* %indvars.iv, <2 x i8>* %zz) { +entry: + br label %for.body + +for.body: + %tmp = load <2 x i8>, <2 x i8>* %zz, align 1 + %tmp1 = extractelement <2 x i8> %tmp, i32 0 + %.lhs.rhs = zext i8 %tmp1 to i32 + call void @g(i32 %.lhs.rhs) + br label %for.body +} + +declare void @g(i32) diff --git a/test/CodeGen/ARM/prera-ldst-aliasing.mir b/test/CodeGen/ARM/prera-ldst-aliasing.mir new file mode 100644 index 000000000000..ce37106ed8d2 --- /dev/null +++ b/test/CodeGen/ARM/prera-ldst-aliasing.mir @@ -0,0 +1,40 @@ +# RUN: llc -run-pass arm-prera-ldst-opt %s -o - | FileCheck %s +--- | + target triple = "thumbv7---eabi" + + define void @ldrd_strd_aa(i32* noalias nocapture %x, i32* noalias nocapture readonly %y) { + entry: + %0 = load i32, i32* %y, align 4 + store i32 %0, i32* %x, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %y, i32 1 + %1 = load i32, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %x, i32 1 + store i32 %1, i32* %arrayidx3, align 4 + ret void + } +... +--- +name: ldrd_strd_aa +alignment: 1 +tracksRegLiveness: true +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } +body: | + bb.0.entry: + liveins: %r0, %r1 + + %1 : gpr = COPY %r1 + %0 : gpr = COPY %r0 + %2 : gpr = t2LDRi12 %1, 0, 14, _ :: (load 4 from %ir.y) + t2STRi12 killed %2, %0, 0, 14, _ :: (store 4 into %ir.x) + %3 : gpr = t2LDRi12 %1, 4, 14, _ :: (load 4 from %ir.arrayidx2) + t2STRi12 killed %3, %0, 4, 14, _ :: (store 4 into %ir.arrayidx3) + ; CHECK: t2LDRi12 + ; CHECK-NEXT: t2LDRi12 + ; CHECK-NEXT: t2STRi12 + ; CHECK-NEXT: t2STRi12 + tBX_RET 14, _ + +... + diff --git a/test/CodeGen/ARM/prera-ldst-insertpt.mir b/test/CodeGen/ARM/prera-ldst-insertpt.mir new file mode 100644 index 000000000000..eafcc7c36d33 --- /dev/null +++ b/test/CodeGen/ARM/prera-ldst-insertpt.mir @@ -0,0 +1,105 @@ +# RUN: llc -run-pass arm-prera-ldst-opt %s -o - | FileCheck %s +--- | + target triple = "thumbv7---eabi" + + define void @a(i32* nocapture %x, i32 %y, i32 %z) { + entry: + ret void + } + + define void @b(i32* nocapture %x, i32 %y, i32 %z) { + entry: + ret void + } +... +--- +# CHECK-LABEL: name: a +name: a +alignment: 1 +tracksRegLiveness: true +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } + - { reg: '%r2', virtual-reg: '%2' } +body: | + bb.0.entry: + liveins: %r0, %r1, %r2 + + %2 : rgpr = COPY %r2 + %1 : rgpr = COPY %r1 + %0 : gpr = COPY %r0 + %3 : rgpr = t2MUL %2, %2, 14, _ + %4 : rgpr = t2MUL %1, %1, 14, _ + %5 : rgpr = t2MOVi32imm -858993459 + %6 : rgpr, %7 : rgpr = t2UMULL killed %3, %5, 14, _ + %8 : rgpr, %9 : rgpr = t2UMULL killed %4, %5, 14, _ + t2STRi12 %1, %0, 0, 14, _ :: (store 4) + %10 : rgpr = t2LSLri %2, 1, 14, _, _ + t2STRi12 killed %10, %0, 4, 14, _ :: (store 4) + + ; Make sure we move the paired stores next to each other, and + ; insert them in an appropriate location. 
+ ; CHECK: t2STRi12 %1, + ; CHECK-NEXT: t2STRi12 killed %10, + ; CHECK-NEXT: t2MOVi + ; CHECK-NEXT: t2ADDrs + + %11 : rgpr = t2MOVi 55, 14, _, _ + %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _ + t2STRi12 killed %12, %0, 16, 14, _ :: (store 4) + %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _ + t2STRi12 killed %13, %0, 20, 14, _ :: (store 4) + + ; Make sure we move the paired stores next to each other. + ; CHECK: t2STRi12 killed %12, + ; CHECK-NEXT: t2STRi12 killed %13, + + tBX_RET 14, _ +--- +# CHECK-LABEL: name: b +name: b +alignment: 1 +tracksRegLiveness: true +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } + - { reg: '%r2', virtual-reg: '%2' } +body: | + bb.0.entry: + liveins: %r0, %r1, %r2 + + %2 : rgpr = COPY %r2 + %1 : rgpr = COPY %r1 + %0 : gpr = COPY %r0 + t2STRi12 %1, %0, 0, 14, _ :: (store 4) + %10 : rgpr = t2LSLri %2, 1, 14, _, _ + t2STRi12 killed %10, %0, 4, 14, _ :: (store 4) + %3 : rgpr = t2MUL %2, %2, 14, _ + t2STRi12 %3, %0, 8, 14, _ :: (store 4) + + ; Make sure we move the paired stores next to each other, and + ; insert them in an appropriate location. + ; CHECK: t2STRi12 {{.*}}, 0 + ; CHECK-NEXT: t2STRi12 {{.*}}, 4 + ; CHECK-NEXT: t2STRi12 {{.*}}, 8 + ; CHECK-NEXT: t2MUL + ; CHECK-NEXT: t2MOVi32imm + + %4 : rgpr = t2MUL %1, %1, 14, _ + %5 : rgpr = t2MOVi32imm -858993459 + %6 : rgpr, %7 : rgpr = t2UMULL killed %3, %5, 14, _ + %8 : rgpr, %9 : rgpr = t2UMULL killed %4, %5, 14, _ + %10 : rgpr = t2LSLri %2, 1, 14, _, _ + %11 : rgpr = t2MOVi 55, 14, _, _ + %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _ + t2STRi12 killed %12, %0, 16, 14, _ :: (store 4) + %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _ + t2STRi12 killed %13, %0, 20, 14, _ :: (store 4) + + ; Make sure we move the paired stores next to each other. + ; CHECK: t2STRi12 {{.*}}, 16 + ; CHECK-NEXT: t2STRi12 {{.*}}, 20 + + tBX_RET 14, _ + +... 
diff --git a/test/CodeGen/ARM/rbit.ll b/test/CodeGen/ARM/rbit.ll index a2bfeca75526..c8badfb32370 100644 --- a/test/CodeGen/ARM/rbit.ll +++ b/test/CodeGen/ARM/rbit.ll @@ -10,7 +10,8 @@ entry: ; CHECK-LABEL: rbit_constant ; CHECK: mov r0, #0 -; CHECK: rbit r0, r0 +; CHECK-NOT: rbit +; CHECK: bx lr define i32 @rbit_constant() { entry: %rbit.i = call i32 @llvm.arm.rbit(i32 0) diff --git a/test/CodeGen/ARM/rev.ll b/test/CodeGen/ARM/rev.ll index f95f97105b9f..a36526ff1fb0 100644 --- a/test/CodeGen/ARM/rev.ll +++ b/test/CodeGen/ARM/rev.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s define i32 @test1(i32 %X) nounwind { -; CHECK: test1 +; CHECK-LABEL: test1 ; CHECK: rev16 r0, r0 %tmp1 = lshr i32 %X, 8 %X15 = bitcast i32 %X to i32 @@ -17,7 +17,7 @@ define i32 @test1(i32 %X) nounwind { } define i32 @test2(i32 %X) nounwind { -; CHECK: test2 +; CHECK-LABEL: test2 ; CHECK: revsh r0, r0 %tmp1 = lshr i32 %X, 8 %tmp1.upgrd.1 = trunc i32 %tmp1 to i16 @@ -58,7 +58,7 @@ entry: ; rdar://9609059 define i32 @test5(i32 %i) nounwind readnone { entry: -; CHECK: test5 +; CHECK-LABEL: test5 ; CHECK: revsh r0, r0 %shl = shl i32 %i, 24 %shr = ashr exact i32 %shl, 16 @@ -71,7 +71,7 @@ entry: ; rdar://9609108 define i32 @test6(i32 %x) nounwind readnone { entry: -; CHECK: test6 +; CHECK-LABEL: test6 ; CHECK: rev16 r0, r0 %and = shl i32 %x, 8 %shl = and i32 %and, 65280 @@ -88,7 +88,7 @@ entry: ; rdar://9164521 define i32 @test7(i32 %a) nounwind readnone { entry: -; CHECK: test7 +; CHECK-LABEL: test7 ; CHECK: rev r0, r0 ; CHECK: lsr r0, r0, #16 %and = lshr i32 %a, 8 @@ -101,7 +101,7 @@ entry: define i32 @test8(i32 %a) nounwind readnone { entry: -; CHECK: test8 +; CHECK-LABEL: test8 ; CHECK: revsh r0, r0 %and = lshr i32 %a, 8 %shr4 = and i32 %and, 255 @@ -115,7 +115,7 @@ entry: ; rdar://10750814 define zeroext i16 @test9(i16 zeroext %v) nounwind readnone { entry: -; CHECK: test9 +; CHECK-LABEL: test9 ; CHECK: rev16 r0, r0 %conv = zext i16 %v to i32 %shr4 = lshr i32 %conv, 8 diff --git a/test/CodeGen/ARM/select_const.ll b/test/CodeGen/ARM/select_const.ll new file mode 100644 index 000000000000..48fe572bf8a7 --- /dev/null +++ b/test/CodeGen/ARM/select_const.ll @@ -0,0 +1,326 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-eabi-unknown-unknown | FileCheck %s + +; Select of constants: control flow / conditional moves can always be replaced by logic+math (but may not be worth it?). +; Test the zeroext/signext variants of each pattern to see if that makes a difference. 
+ +; select Cond, 0, 1 --> zext (!Cond) + +define i32 @select_0_or_1(i1 %cond) { +; CHECK-LABEL: select_0_or_1: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #1 +; CHECK-NEXT: bic r0, r1, r0 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 0, i32 1 + ret i32 %sel +} + +define i32 @select_0_or_1_zeroext(i1 zeroext %cond) { +; CHECK-LABEL: select_0_or_1_zeroext: +; CHECK: @ BB#0: +; CHECK-NEXT: eor r0, r0, #1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 0, i32 1 + ret i32 %sel +} + +define i32 @select_0_or_1_signext(i1 signext %cond) { +; CHECK-LABEL: select_0_or_1_signext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #1 +; CHECK-NEXT: bic r0, r1, r0 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 0, i32 1 + ret i32 %sel +} + +; select Cond, 1, 0 --> zext (Cond) + +define i32 @select_1_or_0(i1 %cond) { +; CHECK-LABEL: select_1_or_0: +; CHECK: @ BB#0: +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 1, i32 0 + ret i32 %sel +} + +define i32 @select_1_or_0_zeroext(i1 zeroext %cond) { +; CHECK-LABEL: select_1_or_0_zeroext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 1, i32 0 + ret i32 %sel +} + +define i32 @select_1_or_0_signext(i1 signext %cond) { +; CHECK-LABEL: select_1_or_0_signext: +; CHECK: @ BB#0: +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 1, i32 0 + ret i32 %sel +} + +; select Cond, 0, -1 --> sext (!Cond) + +define i32 @select_0_or_neg1(i1 %cond) { +; CHECK-LABEL: select_0_or_neg1: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #1 +; CHECK-NEXT: bic r0, r1, r0 +; CHECK-NEXT: rsb r0, r0, #0 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 0, i32 -1 + ret i32 %sel +} + +define i32 @select_0_or_neg1_zeroext(i1 zeroext %cond) { +; CHECK-LABEL: select_0_or_neg1_zeroext: +; CHECK: @ BB#0: +; CHECK-NEXT: eor r0, r0, #1 +; CHECK-NEXT: rsb r0, r0, #0 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 0, i32 -1 + ret i32 %sel +} + +define i32 @select_0_or_neg1_signext(i1 signext %cond) { +; CHECK-LABEL: select_0_or_neg1_signext: +; CHECK: @ BB#0: +; CHECK-NEXT: mvn r0, r0 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 0, i32 -1 + ret i32 %sel +} + +define i32 @select_0_or_neg1_alt(i1 %cond) { +; CHECK-LABEL: select_0_or_neg1_alt: +; CHECK: @ BB#0: +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: sub r0, r0, #1 +; CHECK-NEXT: mov pc, lr + %z = zext i1 %cond to i32 + %add = add i32 %z, -1 + ret i32 %add +} + +define i32 @select_0_or_neg1_alt_zeroext(i1 zeroext %cond) { +; CHECK-LABEL: select_0_or_neg1_alt_zeroext: +; CHECK: @ BB#0: +; CHECK-NEXT: sub r0, r0, #1 +; CHECK-NEXT: mov pc, lr + %z = zext i1 %cond to i32 + %add = add i32 %z, -1 + ret i32 %add +} + +define i32 @select_0_or_neg1_alt_signext(i1 signext %cond) { +; CHECK-LABEL: select_0_or_neg1_alt_signext: +; CHECK: @ BB#0: +; CHECK-NEXT: mvn r0, r0 +; CHECK-NEXT: mov pc, lr + %z = zext i1 %cond to i32 + %add = add i32 %z, -1 + ret i32 %add +} + +; select Cond, -1, 0 --> sext (Cond) + +define i32 @select_neg1_or_0(i1 %cond) { +; CHECK-LABEL: select_neg1_or_0: +; CHECK: @ BB#0: +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: rsb r0, r0, #0 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 -1, i32 0 + ret i32 %sel +} + +define i32 @select_neg1_or_0_zeroext(i1 zeroext %cond) { +; CHECK-LABEL: select_neg1_or_0_zeroext: +; CHECK: @ BB#0: +; CHECK-NEXT: rsb r0, r0, #0 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 -1, i32 0 + ret i32 %sel +} + +define i32 @select_neg1_or_0_signext(i1 signext %cond) { 
+; CHECK-LABEL: select_neg1_or_0_signext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 -1, i32 0 + ret i32 %sel +} + +; select Cond, C+1, C --> add (zext Cond), C + +define i32 @select_Cplus1_C(i1 %cond) { +; CHECK-LABEL: select_Cplus1_C: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #41 +; CHECK-NEXT: tst r0, #1 +; CHECK-NEXT: movne r1, #42 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 42, i32 41 + ret i32 %sel +} + +define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) { +; CHECK-LABEL: select_Cplus1_C_zeroext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #41 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: movne r1, #42 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 42, i32 41 + ret i32 %sel +} + +define i32 @select_Cplus1_C_signext(i1 signext %cond) { +; CHECK-LABEL: select_Cplus1_C_signext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #41 +; CHECK-NEXT: tst r0, #1 +; CHECK-NEXT: movne r1, #42 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 42, i32 41 + ret i32 %sel +} + +; select Cond, C, C+1 --> add (sext Cond), C + +define i32 @select_C_Cplus1(i1 %cond) { +; CHECK-LABEL: select_C_Cplus1: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #42 +; CHECK-NEXT: tst r0, #1 +; CHECK-NEXT: movne r1, #41 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 41, i32 42 + ret i32 %sel +} + +define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) { +; CHECK-LABEL: select_C_Cplus1_zeroext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #42 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: movne r1, #41 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 41, i32 42 + ret i32 %sel +} + +define i32 @select_C_Cplus1_signext(i1 signext %cond) { +; CHECK-LABEL: select_C_Cplus1_signext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #42 +; CHECK-NEXT: tst r0, #1 +; CHECK-NEXT: movne r1, #41 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 41, i32 42 + ret i32 %sel +} + +; In general, select of 2 constants could be: +; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> add (and (sext Cond), C1-C2), C2 + +define i32 @select_C1_C2(i1 %cond) { +; CHECK-LABEL: select_C1_C2: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #165 +; CHECK-NEXT: tst r0, #1 +; CHECK-NEXT: orr r1, r1, #256 +; CHECK-NEXT: moveq r1, #42 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 421, i32 42 + ret i32 %sel +} + +define i32 @select_C1_C2_zeroext(i1 zeroext %cond) { +; CHECK-LABEL: select_C1_C2_zeroext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #165 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: orr r1, r1, #256 +; CHECK-NEXT: moveq r1, #42 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 421, i32 42 + ret i32 %sel +} + +define i32 @select_C1_C2_signext(i1 signext %cond) { +; CHECK-LABEL: select_C1_C2_signext: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #165 +; CHECK-NEXT: tst r0, #1 +; CHECK-NEXT: orr r1, r1, #256 +; CHECK-NEXT: moveq r1, #42 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i32 421, i32 42 + ret i32 %sel +} + +; 4295032833 = 0x100010001. +; This becomes an opaque constant via ConstantHoisting, so we don't fold it into the select. 
+ +define i64 @opaque_constant1(i1 %cond, i64 %x) { +; CHECK-LABEL: opaque_constant1: +; CHECK: @ BB#0: +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ands r12, r0, #1 +; CHECK-NEXT: mov lr, #1 +; CHECK-NEXT: mov r0, #23 +; CHECK-NEXT: eor r3, r3, #1 +; CHECK-NEXT: orr lr, lr, #65536 +; CHECK-NEXT: mvnne r0, #3 +; CHECK-NEXT: movne r12, #1 +; CHECK-NEXT: and r4, r0, lr +; CHECK-NEXT: eor r2, r2, lr +; CHECK-NEXT: subs r0, r4, #1 +; CHECK-NEXT: sbc r1, r12, #0 +; CHECK-NEXT: orrs r2, r2, r3 +; CHECK-NEXT: movne r0, r4 +; CHECK-NEXT: movne r1, r12 +; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i64 -4, i64 23 + %bo = and i64 %sel, 4295032833 ; 0x100010001 + %cmp = icmp eq i64 %x, 4295032833 + %sext = sext i1 %cmp to i64 + %add = add i64 %bo, %sext + ret i64 %add +} + +; 65537 == 0x10001. +; This becomes an opaque constant via ConstantHoisting, so we don't fold it into the select. + +define i64 @opaque_constant2(i1 %cond, i64 %x) { +; CHECK-LABEL: opaque_constant2: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r1, #1 +; CHECK-NEXT: tst r0, #1 +; CHECK-NEXT: orr r1, r1, #65536 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: moveq r0, #23 +; CHECK-NEXT: and r0, r0, r1 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: mov pc, lr + %sel = select i1 %cond, i64 65537, i64 23 + %bo = and i64 %sel, 65537 + ret i64 %bo +} + diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll index 8c1502e14655..09e8ed4bc096 100644 --- a/test/CodeGen/ARM/select_xform.ll +++ b/test/CodeGen/ARM/select_xform.ll @@ -223,21 +223,19 @@ entry: ret i32 %add } -; Do not fold the xor into the select +; Fold the xor into the select. define i32 @t15(i32 %p) { entry: ; ARM-LABEL: t15: -; ARM: mov [[REG:r[0-9]+]], #2 +; ARM: mov [[REG:r[0-9]+]], #3 ; ARM: cmp r0, #8 -; ARM: movwgt [[REG:r[0-9]+]], #1 -; ARM: eor r0, [[REG:r[0-9]+]], #1 +; ARM: movwgt [[REG:r[0-9]+]], #0 ; T2-LABEL: t15: -; T2: movs [[REG:r[0-9]+]], #2 +; T2: movs [[REG:r[0-9]+]], #3 ; T2: cmp [[REG:r[0-9]+]], #8 ; T2: it gt -; T2: movgt [[REG:r[0-9]+]], #1 -; T2: eor r0, [[REG:r[0-9]+]], #1 +; T2: movgt [[REG:r[0-9]+]], #0 %cmp = icmp sgt i32 %p, 8 %a = select i1 %cmp, i32 1, i32 2 %xor = xor i32 %a, 1 diff --git a/test/CodeGen/ARM/setcc-logic.ll b/test/CodeGen/ARM/setcc-logic.ll new file mode 100644 index 000000000000..79bae1facb3e --- /dev/null +++ b/test/CodeGen/ARM/setcc-logic.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 | FileCheck %s + +define zeroext i1 @ne_neg1_and_ne_zero(i32 %x) nounwind { +; CHECK-LABEL: ne_neg1_and_ne_zero: +; CHECK: @ BB#0: +; CHECK-NEXT: add r1, r0, #1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: cmp r1, #1 +; CHECK-NEXT: movwhi r0, #1 +; CHECK-NEXT: bx lr + %cmp1 = icmp ne i32 %x, -1 + %cmp2 = icmp ne i32 %x, 0 + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} + +; PR32401 - https://bugs.llvm.org/show_bug.cgi?id=32401 + +define zeroext i1 @and_eq(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { +; CHECK-LABEL: and_eq: +; CHECK: @ BB#0: +; CHECK-NEXT: eor r2, r2, r3 +; CHECK-NEXT: eor r0, r0, r1 +; CHECK-NEXT: orrs r0, r0, r2 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: movweq r0, #1 +; CHECK-NEXT: bx lr + %cmp1 = icmp eq i32 %a, %b + %cmp2 = icmp eq i32 %c, %d + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} + +define zeroext i1 @or_ne(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { +; CHECK-LABEL: or_ne: +; CHECK: @ BB#0: +; CHECK-NEXT: eor r2, r2, r3 +; CHECK-NEXT: eor r0, r0, r1 +; 
CHECK-NEXT: orrs r0, r0, r2 +; CHECK-NEXT: movwne r0, #1 +; CHECK-NEXT: bx lr + %cmp1 = icmp ne i32 %a, %b + %cmp2 = icmp ne i32 %c, %d + %or = or i1 %cmp1, %cmp2 + ret i1 %or +} + +define <4 x i1> @and_eq_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind { +; CHECK-LABEL: and_eq_vec: +; CHECK: @ BB#0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vmov d19, r2, r3 +; CHECK-NEXT: add r12, sp, #40 +; CHECK-NEXT: add lr, sp, #8 +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vld1.64 {d16, d17}, [lr] +; CHECK-NEXT: add r0, sp, #24 +; CHECK-NEXT: vld1.64 {d20, d21}, [r12] +; CHECK-NEXT: vceq.i32 q8, q9, q8 +; CHECK-NEXT: vld1.64 {d22, d23}, [r0] +; CHECK-NEXT: vceq.i32 q9, q11, q10 +; CHECK-NEXT: vmovn.i32 d16, q8 +; CHECK-NEXT: vmovn.i32 d17, q9 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: pop {r11, pc} + %cmp1 = icmp eq <4 x i32> %a, %b + %cmp2 = icmp eq <4 x i32> %c, %d + %and = and <4 x i1> %cmp1, %cmp2 + ret <4 x i1> %and +} + diff --git a/test/CodeGen/ARM/setcc-sentinals.ll b/test/CodeGen/ARM/setcc-sentinals.ll deleted file mode 100644 index dc45e0e13881..000000000000 --- a/test/CodeGen/ARM/setcc-sentinals.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 -asm-verbose=false %s -o - | FileCheck %s - -define zeroext i1 @test0(i32 %x) nounwind { -; CHECK-LABEL: test0: -; CHECK: add [[REG:(r[0-9]+)|(lr)]], r0, #1 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: cmp [[REG]], #1 -; CHECK-NEXT: movwhi r0, #1 -; CHECK-NEXT: bx lr - %cmp1 = icmp ne i32 %x, -1 - %not.cmp = icmp ne i32 %x, 0 - %.cmp1 = and i1 %cmp1, %not.cmp - ret i1 %.cmp1 -} diff --git a/test/CodeGen/ARM/single-issue-r52.mir b/test/CodeGen/ARM/single-issue-r52.mir new file mode 100644 index 000000000000..6c95f7603e6e --- /dev/null +++ b/test/CodeGen/ARM/single-issue-r52.mir @@ -0,0 +1,86 @@ +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=misched -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=misched -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP +# REQUIRES: asserts +--- | + ; ModuleID = 'foo.ll' + source_filename = "foo.ll" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "arm---eabi" + + %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } + ; Function Attrs: nounwind + define <8 x i8> @foo(i8* %A) { + %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8* %A, i32 8) + %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0 + %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 1 + %tmp4 = add <8 x i8> %tmp2, %tmp3 + ret <8 x i8> %tmp4 + } + declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8*, i32) + +# CHECK: ********** MI Scheduling ********** +# CHECK: ScheduleDAGMILive::schedule starting +# CHECK: SU(1): %vreg1 = VLD4d8Pseudo %vreg0, 8, pred:14, pred:%noreg; mem:LD32[%A](align=8) QQPR:%vreg1 GPR:%vreg0 +# CHECK: Latency : 8 +# CHECK: Single Issue : true; +# CHECK: SU(2): %vreg4 = VADDv8i8 %vreg1:dsub_0, %vreg1:dsub_1, pred:14, pred:%noreg; DPR:%vreg4 QQPR:%vreg1 +# CHECK: Latency : 5 +# CHECK: Single Issue : false; +# CHECK: SU(3): %vreg5, %vreg6 = VMOVRRD %vreg4, pred:14, pred:%noreg; GPR:%vreg5,%vreg6 DPR:%vreg4 +# CHECK: Latency : 4 +# CHECK: Single Issue : false; + 
+# TOPDOWN: Scheduling SU(1) %vreg1 = VLD4d8Pseudo +# TOPDOWN: Bump cycle to end group +# TOPDOWN: Scheduling SU(2) %vreg4 = VADDv8i8 + +# BOTTOMUP: Scheduling SU(2) %vreg4 = VADDv8i8 +# BOTTOMUP: Scheduling SU(1) %vreg1 = VLD4d8Pseudo +# BOTTOMUP: Bump cycle to begin group + +... +--- +name: foo +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: qqpr } + - { id: 2, class: dpr } + - { id: 3, class: dpr } + - { id: 4, class: dpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } +liveins: + - { reg: '%r0', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %r0 + + %0 = COPY %r0 + %1 = VLD4d8Pseudo %0, 8, 14, _ :: (load 32 from %ir.A, align 8) + %4 = VADDv8i8 %1.dsub_0, %1.dsub_1, 14, _ + %5, %6 = VMOVRRD %4, 14, _ + %r0 = COPY %5 + %r1 = COPY %6 + BX_RET 14, _, implicit %r0, implicit killed %r1 + +... diff --git a/test/CodeGen/ARM/sjljeh-swifterror.ll b/test/CodeGen/ARM/sjljeh-swifterror.ll new file mode 100644 index 000000000000..aae0e75c98af --- /dev/null +++ b/test/CodeGen/ARM/sjljeh-swifterror.ll @@ -0,0 +1,27 @@ +; RUN: opt -sjljehprepare -verify < %s | FileCheck %s +target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" +target triple = "armv7s-apple-ios7.0" + +%swift.error = type opaque + +declare void @objc_msgSend() local_unnamed_addr + +declare i32 @__objc_personality_v0(...) + +; Make sure we don't leave a select on a swifterror argument. 
+; CHECK-LABEL: @test +; CHECK-NOT: select true, %0 +define swiftcc void @test(%swift.error** swifterror) local_unnamed_addr personality i32 (...)* @__objc_personality_v0 { +entry: + %call28.i = invoke i32 bitcast (void ()* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef) + to label %invoke.cont.i unwind label %lpad.i + +invoke.cont.i: + unreachable + +lpad.i: + %1 = landingpad { i8*, i32 } + cleanup + resume { i8*, i32 } undef +} + diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll index aa093192f2b2..4788644cf195 100644 --- a/test/CodeGen/ARM/smml.ll +++ b/test/CodeGen/ARM/smml.ll @@ -1,20 +1,15 @@ -; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6 -; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7 -; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s -check-prefix=CHECK-THUMB -; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK-THUMB -; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK-THUMBV6T2 -; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-THUMBV7 +; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V4 +; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6 +; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6 +; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMB +; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6 +; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6T2 +; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6T2 define i32 @Test0(i32 %a, i32 %b, i32 %c) nounwind readnone ssp { entry: ; CHECK-LABEL: Test0 ; CHECK-NOT: smmls -; CHECK-V6-NOT: smmls -; CHECK-V7-NOT: smmls -; CHECK_THUMB-NOT: smmls -; CHECK-THUMBV6T2-NOT: smmls -; CHECK-THUMBV7-NOT: smmls %conv4 = zext i32 %a to i64 %conv1 = sext i32 %b to i64 %conv2 = sext i32 %c to i64 @@ -27,12 +22,11 @@ entry: define i32 @Test1(i32 %a, i32 %b, i32 %c) { ;CHECK-LABEL: Test1 -;CHECK-NOT: smmls +;CHECK-V4-NOT: smmls ;CHECK-THUMB-NOT: smmls +;CHECK-THUMBV6-NOT: smmls ;CHECK-V6: smmls r0, [[Rn:r[1-2]]], [[Rm:r[1-2]]], r0 -;CHECK-V7: smmls r0, [[Rn:r[1-2]]], [[Rm:r[1-2]]], r0 ;CHECK-THUMBV6T2: smmls r0, [[Rn:r[1-2]]], [[Rm:r[1-2]]], r0 -;CHECK-THUMBV7: smmls r0, [[Rn:r[1-2]]], [[Rm:r[1-2]]], r0 entry: %conv = sext i32 %b to i64 %conv1 = sext i32 %c to i64 @@ -47,10 +41,21 @@ entry: declare void @opaque(i32) define void @test_used_flags(i32 %in1, i32 %in2) { -; CHECK-V7-LABEL: test_used_flags: -; CHECK-V7: smull [[PROD_LO:r[0-9]+]], [[PROD_HI:r[0-9]+]], r0, r1 -; CHECK-V7: rsbs {{.*}}, [[PROD_LO]], #0 -; CHECK-V7: rscs {{.*}}, [[PROD_HI]], #0 +; CHECK-LABEL: test_used_flags: +; CHECK-THUMB: cmp r1, #0 +; CHECK-THUMB: push {r2} +; CHECK-THUMB: pop {r3} +; CHECK-THUMB: ble +; CHECK-THUMBV6: cmp r1, #0 +; CHECK-THUMBV6: mov r3, r2 +; CHECK-THUMBV6: ble +; CHECK-V6: smull [[PROD_LO:r[0-9]+]], [[PROD_HI:r[0-9]+]], r0, r1 +; CHECK-V6: rsbs {{.*}}, [[PROD_LO]], #0 +; CHECK-V6: rscs {{.*}}, [[PROD_HI]], #0 +; CHECK-THUMBV6T2: smull [[PROD_LO:r[0-9]+]], [[PROD_HI:r[0-9]+]], r0, r1 +; CHECK-THUMBV6T2: movs [[ZERO:r[0-9]+]], #0 +; CHECK-THUMBV6T2: rsbs {{.*}}, [[PROD_LO]], #0 +; CHECK-THUMBV6T2: sbcs.w {{.*}}, 
[[ZERO]], [[PROD_HI]] %in1.64 = sext i32 %in1 to i64 %in2.64 = sext i32 %in2 to i64 %mul = mul nsw i64 %in1.64, %in2.64 diff --git a/test/CodeGen/ARM/smul.ll b/test/CodeGen/ARM/smul.ll index 3c187aa846d5..2b7be41ddb24 100644 --- a/test/CodeGen/ARM/smul.ll +++ b/test/CodeGen/ARM/smul.ll @@ -262,3 +262,32 @@ define i32 @f21(i32 %a, i32 %x, i16 %y) { %tmp5 = add i32 %a, %tmp4 ret i32 %tmp5 } + +@global_b = external global i16, align 2 + +define i32 @f22(i32 %a) { +; CHECK-LABEL: f22: +; CHECK: smulwb r0, r0, r1 +; CHECK-THUMBV6-NOT: smulwb + %b = load i16, i16* @global_b, align 2 + %sext = sext i16 %b to i64 + %conv = sext i32 %a to i64 + %mul = mul nsw i64 %sext, %conv + %shr37 = lshr i64 %mul, 16 + %conv4 = trunc i64 %shr37 to i32 + ret i32 %conv4 +} + +define i32 @f23(i32 %a, i32 %c) { +; CHECK-LABEL: f23: +; CHECK: smlawb r0, r0, r2, r1 +; CHECK-THUMBV6-NOT: smlawb + %b = load i16, i16* @global_b, align 2 + %sext = sext i16 %b to i64 + %conv = sext i32 %a to i64 + %mul = mul nsw i64 %sext, %conv + %shr49 = lshr i64 %mul, 16 + %conv5 = trunc i64 %shr49 to i32 + %add = add nsw i32 %conv5, %c + ret i32 %add +} diff --git a/test/CodeGen/ARM/softfp-fabs-fneg.ll b/test/CodeGen/ARM/softfp-fabs-fneg.ll index b608fb840218..b7c684d35b57 100644 --- a/test/CodeGen/ARM/softfp-fabs-fneg.ll +++ b/test/CodeGen/ARM/softfp-fabs-fneg.ll @@ -14,8 +14,7 @@ define double @f(double %a) { define float @g(float %a) { ; CHECK-LABEL: g: - ; CHECK-THUMB: bic r0, r0, #-2147483648 - ; CHECK-ARM: bfc r0, #31, #1 + ; CHECK: bic r0, r0, #-2147483648 ; CHECK-NEXT: bx lr %x = call float @llvm.fabs.f32(float %a) readnone ret float %x diff --git a/test/CodeGen/ARM/special-reg-mcore.ll b/test/CodeGen/ARM/special-reg-mcore.ll index 45e6db9e78fe..1ecf8dc77a70 100644 --- a/test/CodeGen/ARM/special-reg-mcore.ll +++ b/test/CodeGen/ARM/special-reg-mcore.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=MCORE +; RUN: llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 --show-mc-encoding 2>&1 | FileCheck %s --check-prefix=MCORE ; RUN: not llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m3 2>&1 | FileCheck %s --check-prefix=M3CORE ; RUN: not llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ACORE @@ -8,20 +8,20 @@ define i32 @read_mclass_registers() nounwind { entry: ; MCORE-LABEL: read_mclass_registers: - ; MCORE: mrs r0, apsr - ; MCORE: mrs r1, iapsr - ; MCORE: mrs r1, eapsr - ; MCORE: mrs r1, xpsr - ; MCORE: mrs r1, ipsr - ; MCORE: mrs r1, epsr - ; MCORE: mrs r1, iepsr - ; MCORE: mrs r1, msp - ; MCORE: mrs r1, psp - ; MCORE: mrs r1, primask - ; MCORE: mrs r1, basepri - ; MCORE: mrs r1, basepri_max - ; MCORE: mrs r1, faultmask - ; MCORE: mrs r1, control + ; MCORE: mrs r0, apsr @ encoding: [0xef,0xf3,0x00,0x80] + ; MCORE: mrs r1, iapsr @ encoding: [0xef,0xf3,0x01,0x81] + ; MCORE: mrs r1, eapsr @ encoding: [0xef,0xf3,0x02,0x81] + ; MCORE: mrs r1, xpsr @ encoding: [0xef,0xf3,0x03,0x81] + ; MCORE: mrs r1, ipsr @ encoding: [0xef,0xf3,0x05,0x81] + ; MCORE: mrs r1, epsr @ encoding: [0xef,0xf3,0x06,0x81] + ; MCORE: mrs r1, iepsr @ encoding: [0xef,0xf3,0x07,0x81] + ; MCORE: mrs r1, msp @ encoding: [0xef,0xf3,0x08,0x81] + ; MCORE: mrs r1, psp @ encoding: [0xef,0xf3,0x09,0x81] + ; MCORE: mrs r1, primask @ encoding: [0xef,0xf3,0x10,0x81] + ; MCORE: mrs r1, basepri @ encoding: [0xef,0xf3,0x11,0x81] + ; MCORE: mrs r1, basepri_max @ encoding: [0xef,0xf3,0x12,0x81] + ; MCORE: mrs r1, faultmask @ encoding: [0xef,0xf3,0x13,0x81] + ; MCORE: mrs r1, control @ 
encoding: [0xef,0xf3,0x14,0x81] %0 = call i32 @llvm.read_register.i32(metadata !0) %1 = call i32 @llvm.read_register.i32(metadata !4) @@ -56,32 +56,32 @@ entry: define void @write_mclass_registers(i32 %x) nounwind { entry: ; MCORE-LABEL: write_mclass_registers: - ; MCORE: msr apsr_nzcvqg, r0 - ; MCORE: msr apsr_nzcvq, r0 - ; MCORE: msr apsr_g, r0 - ; MCORE: msr apsr_nzcvqg, r0 - ; MCORE: msr iapsr_nzcvqg, r0 - ; MCORE: msr iapsr_nzcvq, r0 - ; MCORE: msr iapsr_g, r0 - ; MCORE: msr iapsr_nzcvqg, r0 - ; MCORE: msr eapsr_nzcvqg, r0 - ; MCORE: msr eapsr_nzcvq, r0 - ; MCORE: msr eapsr_g, r0 - ; MCORE: msr eapsr_nzcvqg, r0 - ; MCORE: msr xpsr_nzcvqg, r0 - ; MCORE: msr xpsr_nzcvq, r0 - ; MCORE: msr xpsr_g, r0 - ; MCORE: msr xpsr_nzcvqg, r0 - ; MCORE: msr ipsr, r0 - ; MCORE: msr epsr, r0 - ; MCORE: msr iepsr, r0 - ; MCORE: msr msp, r0 - ; MCORE: msr psp, r0 - ; MCORE: msr primask, r0 - ; MCORE: msr basepri, r0 - ; MCORE: msr basepri_max, r0 - ; MCORE: msr faultmask, r0 - ; MCORE: msr control, r0 + ; MCORE: msr apsr_nzcvq, r0 @ encoding: [0x80,0xf3,0x00,0x88] + ; MCORE: msr apsr_nzcvq, r0 @ encoding: [0x80,0xf3,0x00,0x88] + ; MCORE: msr apsr_g, r0 @ encoding: [0x80,0xf3,0x00,0x84] + ; MCORE: msr apsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x00,0x8c] + ; MCORE: msr iapsr_nzcvq, r0 @ encoding: [0x80,0xf3,0x01,0x88] + ; MCORE: msr iapsr_nzcvq, r0 @ encoding: [0x80,0xf3,0x01,0x88] + ; MCORE: msr iapsr_g, r0 @ encoding: [0x80,0xf3,0x01,0x84] + ; MCORE: msr iapsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x01,0x8c] + ; MCORE: msr eapsr_nzcvq, r0 @ encoding: [0x80,0xf3,0x02,0x88] + ; MCORE: msr eapsr_nzcvq, r0 @ encoding: [0x80,0xf3,0x02,0x88] + ; MCORE: msr eapsr_g, r0 @ encoding: [0x80,0xf3,0x02,0x84] + ; MCORE: msr eapsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x02,0x8c] + ; MCORE: msr xpsr_nzcvq, r0 @ encoding: [0x80,0xf3,0x03,0x88] + ; MCORE: msr xpsr_nzcvq, r0 @ encoding: [0x80,0xf3,0x03,0x88] + ; MCORE: msr xpsr_g, r0 @ encoding: [0x80,0xf3,0x03,0x84] + ; MCORE: msr xpsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x03,0x8c] + ; MCORE: msr ipsr, r0 @ encoding: [0x80,0xf3,0x05,0x88] + ; MCORE: msr epsr, r0 @ encoding: [0x80,0xf3,0x06,0x88] + ; MCORE: msr iepsr, r0 @ encoding: [0x80,0xf3,0x07,0x88] + ; MCORE: msr msp, r0 @ encoding: [0x80,0xf3,0x08,0x88] + ; MCORE: msr psp, r0 @ encoding: [0x80,0xf3,0x09,0x88] + ; MCORE: msr primask, r0 @ encoding: [0x80,0xf3,0x10,0x88] + ; MCORE: msr basepri, r0 @ encoding: [0x80,0xf3,0x11,0x88] + ; MCORE: msr basepri_max, r0 @ encoding: [0x80,0xf3,0x12,0x88] + ; MCORE: msr faultmask, r0 @ encoding: [0x80,0xf3,0x13,0x88] + ; MCORE: msr control, r0 @ encoding: [0x80,0xf3,0x14,0x88] call void @llvm.write_register.i32(metadata !0, i32 %x) call void @llvm.write_register.i32(metadata !1, i32 %x) diff --git a/test/CodeGen/ARM/special-reg-v8m-main.ll b/test/CodeGen/ARM/special-reg-v8m-main.ll index cde296c6b218..ea9c01487d85 100644 --- a/test/CodeGen/ARM/special-reg-v8m-main.ll +++ b/test/CodeGen/ARM/special-reg-v8m-main.ll @@ -90,19 +90,19 @@ entry: define void @write_mclass_registers(i32 %x) nounwind { entry: ; MAINLINE-LABEL: write_mclass_registers: - ; MAINLINE: msr apsr_nzcvqg, r0 + ; MAINLINE: msr apsr_nzcvq, r0 ; MAINLINE: msr apsr_nzcvq, r0 ; MAINLINE: msr apsr_g, r0 ; MAINLINE: msr apsr_nzcvqg, r0 - ; MAINLINE: msr iapsr_nzcvqg, r0 + ; MAINLINE: msr iapsr_nzcvq, r0 ; MAINLINE: msr iapsr_nzcvq, r0 ; MAINLINE: msr iapsr_g, r0 ; MAINLINE: msr iapsr_nzcvqg, r0 - ; MAINLINE: msr eapsr_nzcvqg, r0 + ; MAINLINE: msr eapsr_nzcvq, r0 ; MAINLINE: msr eapsr_nzcvq, r0 ; MAINLINE: msr eapsr_g, r0 ; 
MAINLINE: msr eapsr_nzcvqg, r0 - ; MAINLINE: msr xpsr_nzcvqg, r0 + ; MAINLINE: msr xpsr_nzcvq, r0 ; MAINLINE: msr xpsr_nzcvq, r0 ; MAINLINE: msr xpsr_g, r0 ; MAINLINE: msr xpsr_nzcvqg, r0 diff --git a/test/CodeGen/ARM/stack_guard_remat.ll b/test/CodeGen/ARM/stack_guard_remat.ll index 99d499498450..9b5677608d26 100644 --- a/test/CodeGen/ARM/stack_guard_remat.ll +++ b/test/CodeGen/ARM/stack_guard_remat.ll @@ -51,20 +51,20 @@ define i32 @test_stack_guard_remat() #0 { %a1 = alloca [256 x i32], align 4 %1 = bitcast [256 x i32]* %a1 to i8* - call void @llvm.lifetime.start(i64 1024, i8* %1) + call void @llvm.lifetime.start.p0i8(i64 1024, i8* %1) %2 = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i32 0, i32 0 call void @foo3(i32* %2) #3 call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"() - call void @llvm.lifetime.end(i64 1024, i8* %1) + call void @llvm.lifetime.end.p0i8(i64 1024, i8* %1) ret i32 0 } ; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare void @foo3(i32*) ; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/ARM/static-addr-hoisting.ll b/test/CodeGen/ARM/static-addr-hoisting.ll index 3d47e02f965e..683d607936b8 100644 --- a/test/CodeGen/ARM/static-addr-hoisting.ll +++ b/test/CodeGen/ARM/static-addr-hoisting.ll @@ -6,9 +6,9 @@ define void @multiple_store() { ; CHECK: movs [[VAL:r[0-9]+]], #42 ; CHECK: movt r[[BASE1]], #15 -; CHECK: str [[VAL]], [r[[BASE1]]] -; CHECK: str [[VAL]], [r[[BASE1]], #24] -; CHECK: str.w [[VAL]], [r[[BASE1]], #42] +; CHECK-DAG: str [[VAL]], [r[[BASE1]]] +; CHECK-DAG: str [[VAL]], [r[[BASE1]], #24] +; CHECK-DAG: str.w [[VAL]], [r[[BASE1]], #42] ; CHECK: movw r[[BASE2:[0-9]+]], #20394 ; CHECK: movt r[[BASE2]], #18 diff --git a/test/CodeGen/ARM/tail-opts.ll b/test/CodeGen/ARM/tail-opts.ll index 37e9a4af3be5..475b80b3bb07 100644 --- a/test/CodeGen/ARM/tail-opts.ll +++ b/test/CodeGen/ARM/tail-opts.ll @@ -65,3 +65,55 @@ altret: call void @far(i32 1001) ret void } + +; Use alternating abort functions so that the blocks we wish to merge are not +; layout successors during branch folding. 
+ +; CHECK-LABEL: merge_alternating_aborts: +; CHECK-NOT: _abort +; CHECK-NOT: _alt_abort +; CHECK: bxne lr +; CHECK-NOT: _abort +; CHECK-NOT: _alt_abort +; CHECK: LBB{{.*}}: +; CHECK: mov lr, pc +; CHECK: b _alt_abort +; CHECK-NOT: _abort +; CHECK-NOT: _alt_abort +; CHECK: LBB{{.*}}: +; CHECK: mov lr, pc +; CHECK: b _abort +; CHECK-NOT: _abort +; CHECK-NOT: _alt_abort + +declare void @abort() +declare void @alt_abort() + +define void @merge_alternating_aborts() { +entry: + %c1 = call i1 @qux() + br i1 %c1, label %cont1, label %abort1 +abort1: + call void @abort() + unreachable +cont1: + %c2 = call i1 @qux() + br i1 %c2, label %cont2, label %abort2 +abort2: + call void @alt_abort() + unreachable +cont2: + %c3 = call i1 @qux() + br i1 %c3, label %cont3, label %abort3 +abort3: + call void @abort() + unreachable +cont3: + %c4 = call i1 @qux() + br i1 %c4, label %cont4, label %abort4 +abort4: + call void @alt_abort() + unreachable +cont4: + ret void +} diff --git a/test/CodeGen/ARM/thumb1-div.ll b/test/CodeGen/ARM/thumb1-div.ll new file mode 100644 index 000000000000..844dfe6f963c --- /dev/null +++ b/test/CodeGen/ARM/thumb1-div.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -mtriple=arm-none-eabi -mcpu=cortex-m23 -march=thumb | \ +; RUN: FileCheck %s -check-prefix=CHECK + +define i32 @f1(i32 %a, i32 %b) { +entry: +; CHECK-LABEL: f1 + +; CHECK: sdiv + %tmp1 = sdiv i32 %a, %b ; [#uses=1] + ret i32 %tmp1 +} + +define i32 @f2(i32 %a, i32 %b) { +entry: +; CHECK-LABEL: f2 +; CHECK: udiv + %tmp1 = udiv i32 %a, %b ; [#uses=1] + ret i32 %tmp1 +} + +define i32 @f3(i32 %a, i32 %b) { +entry: +; CHECK-LABEL: f3 + + + %tmp1 = srem i32 %a, %b ; [#uses=1] + ret i32 %tmp1 +; CHECK: sdiv +; CHECK-NEXT: muls +; CHECK-NEXT: subs +} + +define i32 @f4(i32 %a, i32 %b) { +entry: +; CHECK-LABEL: f4 + +; CHECK: udiv +; CHECK-NEXT: muls +; CHECK-NEXT: subs + %tmp1 = urem i32 %a, %b ; [#uses=1] + ret i32 %tmp1 +} + + +define i64 @f5(i64 %a, i64 %b) { +entry: +; CHECK-LABEL: f5 + +; EABI MODE = Remainder in R2-R3, quotient in R0-R1 +; CHECK: __aeabi_ldivmod +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r1, r3 + %tmp1 = srem i64 %a, %b ; [#uses=1] + ret i64 %tmp1 +} + +define i64 @f6(i64 %a, i64 %b) { +entry: +; CHECK-LABEL: f6 + +; EABI MODE = Remainder in R2-R3, quotient in R0-R1 +; CHECK: __aeabi_uldivmod +; CHECK: mov r0, r2 +; CHECK: mov r1, r3 + %tmp1 = urem i64 %a, %b ; [#uses=1] + ret i64 %tmp1 +} diff --git a/test/CodeGen/ARM/unschedule-first-call.ll b/test/CodeGen/ARM/unschedule-first-call.ll new file mode 100644 index 000000000000..4a218afcc5e1 --- /dev/null +++ b/test/CodeGen/ARM/unschedule-first-call.ll @@ -0,0 +1,136 @@ +; RUN: llc < %s +; PR30911 + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv6kz--linux-gnueabihf" + +; Function Attrs: nounwind +define void @dradbg(i32, i32, float*, float*, float*, float*, float*) #0 { + br i1 undef, label %.critedge, label %8 + +.critedge: ; preds = %7 + %.mux2 = select i1 undef, i1 undef, i1 true + br label %8 + +;