Diffstat (limited to 'contrib/llvm/lib/Target/CellSPU')
41 files changed, 14400 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/CellSPU/CellSDKIntrinsics.td b/contrib/llvm/lib/Target/CellSPU/CellSDKIntrinsics.td new file mode 100644 index 000000000000..cdb4099ffbca --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/CellSDKIntrinsics.td @@ -0,0 +1,449 @@ +//===-- CellSDKIntrinsics.td - Cell SDK Intrinsics ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +///--==-- Arithmetic ops intrinsics --==-- +def CellSDKah: + RR_Int_v8i16<0b00010011000, "ah", IntegerOp, int_spu_si_ah>; +def CellSDKahi: + RI10_Int_v8i16<0b00010011000, "ahi", IntegerOp, int_spu_si_ahi>; +def CellSDKa: + RR_Int_v4i32<0b00000011000, "a", IntegerOp, int_spu_si_a>; +def CellSDKai: + RI10_Int_v4i32<0b00111000, "ai", IntegerOp, int_spu_si_ai>; +def CellSDKsfh: + RR_Int_v8i16<0b00010010000, "sfh", IntegerOp, int_spu_si_sfh>; +def CellSDKsfhi: + RI10_Int_v8i16<0b10110000, "sfhi", IntegerOp, int_spu_si_sfhi>; +def CellSDKsf: + RR_Int_v4i32<0b00000010000, "sf", IntegerOp, int_spu_si_sf>; +def CellSDKsfi: + RI10_Int_v4i32<0b00110000, "sfi", IntegerOp, int_spu_si_sfi>; +def CellSDKaddx: + RR_Int_v4i32<0b00000010110, "addx", IntegerOp, int_spu_si_addx>; +def CellSDKcg: + RR_Int_v4i32<0b0100001100, "cg", IntegerOp, int_spu_si_cg>; +def CellSDKcgx: + RR_Int_v4i32<0b01000010110, "cgx", IntegerOp, int_spu_si_cgx>; +def CellSDKsfx: + RR_Int_v4i32<0b10000010110, "sfx", IntegerOp, int_spu_si_sfx>; +def CellSDKbg: + RR_Int_v4i32<0b01000010000, "bg", IntegerOp, int_spu_si_bg>; +def CellSDKbgx: + RR_Int_v4i32<0b11000010110, "bgx", IntegerOp, int_spu_si_bgx>; + +def CellSDKmpy: + RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpy $rT, $rA, $rB", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpy (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + +def CellSDKmpyu: + RRForm<0b00110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpyu $rT, $rA, $rB", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpyu (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))] >; + +def CellSDKmpyi: + RI10Form<0b00101110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "mpyi $rT, $rA, $val", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpyi (v8i16 VECREG:$rA), + i16ImmSExt10:$val))]>; + +def CellSDKmpyui: + RI10Form<0b10101110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "mpyui $rT, $rA, $val", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpyui (v8i16 VECREG:$rA), + i16ImmSExt10:$val))]>; + +def CellSDKmpya: + RRRForm<0b0011, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "mpya $rT, $rA, $rB, $rC", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpya (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB), + (v8i16 VECREG:$rC)))]>; + +def CellSDKmpyh: + RRForm<0b10100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpyh $rT, $rA, $rB", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpyh (v4i32 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + +def CellSDKmpys: + RRForm<0b11100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpys $rT, $rA, $rB", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpys (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + +def CellSDKmpyhh: + RRForm<0b01100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpyhh $rT, $rA, $rB", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhh (v8i16 
VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + +def CellSDKmpyhha: + RRForm<0b01100010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpyhha $rT, $rA, $rB", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhha (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + +// Not sure how to match a (set $rT, (add $rT (mpyhh $rA, $rB)))... so leave +// as an intrinsic for the time being +def CellSDKmpyhhu: + RRForm<0b01110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpyhhu $rT, $rA, $rB", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhhu (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + +def CellSDKmpyhhau: + RRForm<0b01110010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpyhhau $rT, $rA, $rB", IntegerMulDiv, + [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhhau (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + +def CellSDKand: + RRForm<0b1000011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "and\t $rT, $rA, $rB", IntegerOp, + [(set (v4i32 VECREG:$rT), + (int_spu_si_and (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKandc: + RRForm<0b10000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "andc\t $rT, $rA, $rB", IntegerOp, + [(set (v4i32 VECREG:$rT), + (int_spu_si_andc (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKandbi: + RI10Form<0b01101000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val), + "andbi\t $rT, $rA, $val", BranchResolv, + [(set (v16i8 VECREG:$rT), + (int_spu_si_andbi (v16i8 VECREG:$rA), immU8:$val))]>; + +def CellSDKandhi: + RI10Form<0b10101000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "andhi\t $rT, $rA, $val", BranchResolv, + [(set (v8i16 VECREG:$rT), + (int_spu_si_andhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>; + +def CellSDKandi: + RI10Form<0b00101000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "andi\t $rT, $rA, $val", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_andi (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>; + +def CellSDKor: + RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "or\t $rT, $rA, $rB", IntegerOp, + [(set (v4i32 VECREG:$rT), + (int_spu_si_or (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKorc: + RRForm<0b10010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "addc\t $rT, $rA, $rB", IntegerOp, + [(set (v4i32 VECREG:$rT), + (int_spu_si_orc (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKorbi: + RI10Form<0b01100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val), + "orbi\t $rT, $rA, $val", BranchResolv, + [(set (v16i8 VECREG:$rT), + (int_spu_si_orbi (v16i8 VECREG:$rA), immU8:$val))]>; + +def CellSDKorhi: + RI10Form<0b10100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "orhi\t $rT, $rA, $val", BranchResolv, + [(set (v8i16 VECREG:$rT), + (int_spu_si_orhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>; + +def CellSDKori: + RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "ori\t $rT, $rA, $val", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_ori (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>; + +def CellSDKxor: + RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "xor\t $rT, $rA, $rB", IntegerOp, + [(set (v4i32 VECREG:$rT), + (int_spu_si_xor (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKxorbi: + RI10Form<0b01100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val), + "xorbi\t $rT, $rA, $val", BranchResolv, + [(set (v16i8 VECREG:$rT), (int_spu_si_xorbi (v16i8 VECREG:$rA), immU8:$val))]>; + +def CellSDKxorhi: + RI10Form<0b10100000, (outs 
VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "xorhi\t $rT, $rA, $val", BranchResolv, + [(set (v8i16 VECREG:$rT), + (int_spu_si_xorhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>; + +def CellSDKxori: + RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "xori\t $rT, $rA, $val", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_xori (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>; + +def CellSDKnor: + RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "nor\t $rT, $rA, $rB", IntegerOp, + [(set (v4i32 VECREG:$rT), + (int_spu_si_nor (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKnand: + RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "nand\t $rT, $rA, $rB", IntegerOp, + [(set (v4i32 VECREG:$rT), + (int_spu_si_nand (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +//===----------------------------------------------------------------------===// +// Shift/rotate intrinsics: +//===----------------------------------------------------------------------===// + +def CellSDKshli: + Pat<(int_spu_si_shli (v4i32 VECREG:$rA), uimm7:$val), + (SHLIv4i32 VECREG:$rA, (TO_IMM32 imm:$val))>; + +def CellSDKshlqbi: + Pat<(int_spu_si_shlqbi VECREG:$rA, R32C:$rB), + (SHLQBIv16i8 VECREG:$rA, R32C:$rB)>; + +def CellSDKshlqii: + Pat<(int_spu_si_shlqbii VECREG:$rA, uimm7:$val), + (SHLQBIIv16i8 VECREG:$rA, (TO_IMM32 imm:$val))>; + +def CellSDKshlqby: + Pat<(int_spu_si_shlqby VECREG:$rA, R32C:$rB), + (SHLQBYv16i8 VECREG:$rA, R32C:$rB)>; + +def CellSDKshlqbyi: + Pat<(int_spu_si_shlqbyi VECREG:$rA, uimm7:$val), + (SHLQBYIv16i8 VECREG:$rA, (TO_IMM32 imm:$val))>; + + +//===----------------------------------------------------------------------===// +// Branch/compare intrinsics: +//===----------------------------------------------------------------------===// + +def CellSDKceq: + RRForm<0b00000011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "ceq\t $rT, $rA, $rB", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_ceq (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKceqi: + RI10Form<0b00111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "ceqi\t $rT, $rA, $val", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_ceqi (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>; + +def CellSDKceqb: + RRForm<0b00001011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "ceqb\t $rT, $rA, $rB", BranchResolv, + [(set (v16i8 VECREG:$rT), + (int_spu_si_ceqb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>; + +def CellSDKceqbi: + RI10Form<0b01111110, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val), + "ceqbi\t $rT, $rA, $val", BranchResolv, + [(set (v16i8 VECREG:$rT), (int_spu_si_ceqbi (v16i8 VECREG:$rA), immU8:$val))]>; + +def CellSDKceqh: + RRForm<0b00010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "ceqh\t $rT, $rA, $rB", BranchResolv, + [(set (v8i16 VECREG:$rT), + (int_spu_si_ceqh (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>; + +def CellSDKceqhi: + RI10Form<0b10111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "ceqhi\t $rT, $rA, $val", BranchResolv, + [(set (v8i16 VECREG:$rT), + (int_spu_si_ceqhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>; +def CellSDKcgth: + RRForm<0b00010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "cgth\t $rT, $rA, $rB", BranchResolv, + [(set (v8i16 VECREG:$rT), + (int_spu_si_cgth (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>; + +def CellSDKcgthi: + RI10Form<0b10111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "cgthi\t $rT, $rA, $val", BranchResolv, + [(set (v8i16 VECREG:$rT), + 
(int_spu_si_cgthi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>; + +def CellSDKcgt: + RRForm<0b00000010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "cgt\t $rT, $rA, $rB", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_cgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKcgti: + RI10Form<0b00110010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "cgti\t $rT, $rA, $val", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_cgti (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>; + +def CellSDKcgtb: + RRForm<0b00001010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "cgtb\t $rT, $rA, $rB", BranchResolv, + [(set (v16i8 VECREG:$rT), + (int_spu_si_cgtb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>; + +def CellSDKcgtbi: + RI10Form<0b01110010, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val), + "cgtbi\t $rT, $rA, $val", BranchResolv, + [(set (v16i8 VECREG:$rT), (int_spu_si_cgtbi (v16i8 VECREG:$rA), immU8:$val))]>; + +def CellSDKclgth: + RRForm<0b00010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "clgth\t $rT, $rA, $rB", BranchResolv, + [(set (v8i16 VECREG:$rT), + (int_spu_si_clgth (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>; + +def CellSDKclgthi: + RI10Form<0b10111010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "clgthi\t $rT, $rA, $val", BranchResolv, + [(set (v8i16 VECREG:$rT), + (int_spu_si_clgthi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>; + +def CellSDKclgt: + RRForm<0b00000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "clgt\t $rT, $rA, $rB", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_clgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + +def CellSDKclgti: + RI10Form<0b00111010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "clgti\t $rT, $rA, $val", BranchResolv, + [(set (v4i32 VECREG:$rT), + (int_spu_si_clgti (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>; + +def CellSDKclgtb: + RRForm<0b00001011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "clgtb\t $rT, $rA, $rB", BranchResolv, + [(set (v16i8 VECREG:$rT), + (int_spu_si_clgtb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>; + +def CellSDKclgtbi: + RI10Form<0b01111010, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val), + "clgtbi\t $rT, $rA, $val", BranchResolv, + [(set (v16i8 VECREG:$rT), + (int_spu_si_clgtbi (v16i8 VECREG:$rA), immU8:$val))]>; + +//===----------------------------------------------------------------------===// +// Floating-point intrinsics: +//===----------------------------------------------------------------------===// + +def CellSDKfa: + RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "fa\t $rT, $rA, $rB", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fa (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB)))]>; + +def CellSDKfs: + RRForm<0b10100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "fs\t $rT, $rA, $rB", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fs (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB)))]>; + +def CellSDKfm: + RRForm<0b01100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "fm\t $rT, $rA, $rB", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fm (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB)))]>; + +def CellSDKfceq: + RRForm<0b01000011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "fceq\t $rT, $rA, $rB", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fceq (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB)))]>; + +def CellSDKfcgt: + RRForm<0b01000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "fcgt\t $rT, $rA, $rB", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fcgt (v4f32 
VECREG:$rA), + (v4f32 VECREG:$rB)))]>; + +def CellSDKfcmeq: + RRForm<0b01010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "fcmeq\t $rT, $rA, $rB", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fcmeq (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB)))]>; + +def CellSDKfcmgt: + RRForm<0b01010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "fcmgt\t $rT, $rA, $rB", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fcmgt (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB)))]>; + +def CellSDKfma: + RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fma\t $rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fma (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB), + (v4f32 VECREG:$rC)))]>; + +def CellSDKfnms: + RRRForm<0b1011, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fnms\t $rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fnms (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB), + (v4f32 VECREG:$rC)))]>; + +def CellSDKfms: + RRRForm<0b1111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fms\t $rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), (int_spu_si_fms (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB), + (v4f32 VECREG:$rC)))]>; + +//===----------------------------------------------------------------------===// +// Double precision floating-point intrinsics: +//===----------------------------------------------------------------------===// + +def CellSDKdfa: + RRForm<0b00110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfa\t $rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), (int_spu_si_dfa (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB)))]>; + +def CellSDKdfs: + RRForm<0b10110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfs\t $rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), (int_spu_si_dfs (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB)))]>; + +def CellSDKdfm: + RRForm<0b01110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfm\t $rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), (int_spu_si_dfm (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB)))]>; + +def CellSDKdfma: + RRForm<0b00111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfma\t $rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), (int_spu_si_dfma (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB)))]>; + +def CellSDKdfnma: + RRForm<0b11111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfnma\t $rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), (int_spu_si_dfnma (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB)))]>; + +def CellSDKdfnms: + RRForm<0b01111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfnms\t $rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), (int_spu_si_dfnms (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB)))]>; + +def CellSDKdfms: + RRForm<0b10111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfms\t $rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), (int_spu_si_dfms (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB)))]>; diff --git a/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCAsmInfo.cpp new file mode 100644 index 000000000000..4bad37eacaf7 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCAsmInfo.cpp @@ -0,0 +1,43 @@ +//===-- SPUMCAsmInfo.cpp - Cell SPU asm properties ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the SPUMCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "SPUMCAsmInfo.h" +using namespace llvm; + +void SPULinuxMCAsmInfo::anchor() { } + +SPULinuxMCAsmInfo::SPULinuxMCAsmInfo(const Target &T, StringRef TT) { + IsLittleEndian = false; + + ZeroDirective = "\t.space\t"; + Data64bitsDirective = "\t.quad\t"; + AlignmentIsInBytes = false; + + PCSymbol = "."; + CommentString = "#"; + GlobalPrefix = ""; + PrivateGlobalPrefix = ".L"; + + // Has leb128 + HasLEB128 = true; + + SupportsDebugInformation = true; + + // Exception handling is not supported on CellSPU (think about it: you only + // have 256K for code+data. Would you support exception handling?) + ExceptionsType = ExceptionHandling::None; + + // SPU assembly requires ".section" before ".bss" + UsesELFSectionDirectiveForBSS = true; +} + diff --git a/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCAsmInfo.h b/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCAsmInfo.h new file mode 100644 index 000000000000..f786147b9267 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCAsmInfo.h @@ -0,0 +1,30 @@ +//===-- SPUMCAsmInfo.h - Cell SPU asm properties ---------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the SPUMCAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef SPUTARGETASMINFO_H +#define SPUTARGETASMINFO_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" + +namespace llvm { + class Target; + + class SPULinuxMCAsmInfo : public MCAsmInfo { + virtual void anchor(); + public: + explicit SPULinuxMCAsmInfo(const Target &T, StringRef TT); + }; +} // namespace llvm + +#endif /* SPUTARGETASMINFO_H */ diff --git a/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp new file mode 100644 index 000000000000..8450e2c6634c --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp @@ -0,0 +1,94 @@ +//===-- SPUMCTargetDesc.cpp - Cell SPU Target Descriptions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Cell SPU specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#include "SPUMCTargetDesc.h" +#include "SPUMCAsmInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_INSTRINFO_MC_DESC +#include "SPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "SPUGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "SPUGenRegisterInfo.inc" + +using namespace llvm; + +static MCInstrInfo *createSPUMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitSPUMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createCellSPUMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitSPUMCRegisterInfo(X, SPU::R0); + return X; +} + +static MCSubtargetInfo *createSPUMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + InitSPUMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCAsmInfo *createSPUMCAsmInfo(const Target &T, StringRef TT) { + MCAsmInfo *MAI = new SPULinuxMCAsmInfo(T, TT); + + // Initial state of the frame pointer is R1. + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(SPU::R1, 0); + MAI->addInitialFrameState(0, Dst, Src); + + return MAI; +} + +static MCCodeGenInfo *createSPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + // For the time being, use static relocations, since there's really no + // support for PIC yet. + X->InitMCCodeGenInfo(Reloc::Static, CM, OL); + return X; +} + +// Force static initialization. +extern "C" void LLVMInitializeCellSPUTargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn X(TheCellSPUTarget, createSPUMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(TheCellSPUTarget, + createSPUMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheCellSPUTarget, createSPUMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(TheCellSPUTarget, + createCellSPUMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(TheCellSPUTarget, + createSPUMCSubtargetInfo); +} diff --git a/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h b/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h new file mode 100644 index 000000000000..d26449e8908f --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h @@ -0,0 +1,38 @@ +//===-- SPUMCTargetDesc.h - CellSPU Target Descriptions ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides CellSPU specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef SPUMCTARGETDESC_H +#define SPUMCTARGETDESC_H + +namespace llvm { +class Target; + +extern Target TheCellSPUTarget; + +} // End llvm namespace + +// Define symbolic names for Cell registers. This defines a mapping from +// register name to register number. 
+// +#define GET_REGINFO_ENUM +#include "SPUGenRegisterInfo.inc" + +// Defines symbolic names for the SPU instructions. +// +#define GET_INSTRINFO_ENUM +#include "SPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "SPUGenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/CellSPU/SPU.h b/contrib/llvm/lib/Target/CellSPU/SPU.h new file mode 100644 index 000000000000..c660131706cb --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPU.h @@ -0,0 +1,31 @@ +//===-- SPU.h - Top-level interface for Cell SPU Target ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// Cell SPU back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_IBMCELLSPU_H +#define LLVM_TARGET_IBMCELLSPU_H + +#include "MCTargetDesc/SPUMCTargetDesc.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + class SPUTargetMachine; + class FunctionPass; + class formatted_raw_ostream; + + FunctionPass *createSPUISelDag(SPUTargetMachine &TM); + FunctionPass *createSPUNopFillerPass(SPUTargetMachine &tm); + +} + +#endif /* LLVM_TARGET_IBMCELLSPU_H */ diff --git a/contrib/llvm/lib/Target/CellSPU/SPU.td b/contrib/llvm/lib/Target/CellSPU/SPU.td new file mode 100644 index 000000000000..e835b9cac8e1 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPU.td @@ -0,0 +1,66 @@ +//===-- SPU.td - Describe the STI Cell SPU Target Machine --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is the top level entry point for the STI Cell SPU target machine. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing. +// +include "llvm/Target/Target.td" + +// Holder of code fragments (you'd think this'd already be in +// a td file somewhere... 
:-) + +class CodeFrag<dag frag> { + dag Fragment = frag; +} + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "SPURegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction formats, instructions +//===----------------------------------------------------------------------===// + +include "SPUNodes.td" +include "SPUOperands.td" +include "SPUSchedule.td" +include "SPUInstrFormats.td" +include "SPUInstrInfo.td" + +//===----------------------------------------------------------------------===// +// Subtarget features: +//===----------------------------------------------------------------------===// + +def DefaultProc: SubtargetFeature<"", "ProcDirective", "SPU::DEFAULT_PROC", "">; +def LargeMemFeature: + SubtargetFeature<"large_mem","UseLargeMem", "true", + "Use large (>256) LSA memory addressing [default = false]">; + +def SPURev0 : Processor<"v0", SPUItineraries, [DefaultProc]>; + +//===----------------------------------------------------------------------===// +// Calling convention: +//===----------------------------------------------------------------------===// + +include "SPUCallingConv.td" + +// Target: + +def SPUInstrInfo : InstrInfo { + let isLittleEndianEncoding = 1; +} + +def SPU : Target { + let InstructionSet = SPUInstrInfo; +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPU128InstrInfo.td b/contrib/llvm/lib/Target/CellSPU/SPU128InstrInfo.td new file mode 100644 index 000000000000..e051e047333a --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPU128InstrInfo.td @@ -0,0 +1,41 @@ +//===-- SPU128InstrInfo.td - Cell SPU 128-bit operations --*- tablegen -*--===// +// +// Cell SPU 128-bit operations +// +//===----------------------------------------------------------------------===// + +// zext 32->128: Zero extend 32-bit to 128-bit +def : Pat<(i128 (zext R32C:$rSrc)), + (ROTQMBYIr128_zext_r32 R32C:$rSrc, 12)>; + +// zext 64->128: Zero extend 64-bit to 128-bit +def : Pat<(i128 (zext R64C:$rSrc)), + (ROTQMBYIr128_zext_r64 R64C:$rSrc, 8)>; + +// zext 16->128: Zero extend 16-bit to 128-bit +def : Pat<(i128 (zext R16C:$rSrc)), + (ROTQMBYIr128_zext_r32 (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff)), 12)>; + +// zext 8->128: Zero extend 8-bit to 128-bit +def : Pat<(i128 (zext R8C:$rSrc)), + (ROTQMBYIr128_zext_r32 (ANDIi8i32 R8C:$rSrc, 0xf), 12)>; + +// anyext 32->128: Zero extend 32-bit to 128-bit +def : Pat<(i128 (anyext R32C:$rSrc)), + (ROTQMBYIr128_zext_r32 R32C:$rSrc, 12)>; + +// anyext 64->128: Zero extend 64-bit to 128-bit +def : Pat<(i128 (anyext R64C:$rSrc)), + (ROTQMBYIr128_zext_r64 R64C:$rSrc, 8)>; + +// anyext 16->128: Zero extend 16-bit to 128-bit +def : Pat<(i128 (anyext R16C:$rSrc)), + (ROTQMBYIr128_zext_r32 (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff)), 12)>; + +// anyext 8->128: Zero extend 8-bit to 128-bit +def : Pat<(i128 (anyext R8C:$rSrc)), + (ROTQMBYIr128_zext_r32 (ANDIi8i32 R8C:$rSrc, 0xf), 12)>; + +// Shift left +def : Pat<(shl GPRC:$rA, R32C:$rB), + (SHLQBYBIr128 (SHLQBIr128 GPRC:$rA, R32C:$rB), R32C:$rB)>; diff --git a/contrib/llvm/lib/Target/CellSPU/SPU64InstrInfo.td b/contrib/llvm/lib/Target/CellSPU/SPU64InstrInfo.td new file mode 100644 index 000000000000..bea33b5362d2 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPU64InstrInfo.td @@ -0,0 +1,408 @@ +//====-- SPU64InstrInfo.td - Cell SPU 64-bit operations ---*- tablegen -*--===// +// +// Cell SPU 64-bit 
operations +// +//===----------------------------------------------------------------------===// + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// 64-bit comparisons: +// +// 1. The instruction sequences for vector vice scalar differ by a +// constant. In the scalar case, we're only interested in the +// top two 32-bit slots, whereas we're interested in an exact +// all-four-slot match in the vector case. +// +// 2. There are no "immediate" forms, since loading 64-bit constants +// could be a constant pool load. +// +// 3. i64 setcc results are i32, which are subsequently converted to a FSM +// mask when used in a select pattern. +// +// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask (TODO) +// [Note: this may be moot, since gb produces v4i32 or r32.] +// +// 5. The code sequences for r64 and v2i64 are probably overly conservative, +// compared to the code that gcc produces. +// +// M00$E B!tes Kan be Pretty N@sTi!!!!! (apologies to Monty!) +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +// selb instruction definition for i64. Note that the selection mask is +// a vector, produced by various forms of FSM: +def SELBr64_cond: + SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC), + [/* no pattern */]>; + +// The generic i64 select pattern, which assumes that the comparison result +// is in a 32-bit register that contains a select mask pattern (i.e., gather +// bits result): + +def : Pat<(select R32C:$rCond, R64C:$rFalse, R64C:$rTrue), + (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 R32C:$rCond))>; + +// select the negative condition: +class I64SELECTNegCond<PatFrag cond, CodeFrag compare>: + Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse), + (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 compare.Fragment))>; + +// setcc the negative condition: +class I64SETCCNegCond<PatFrag cond, CodeFrag compare>: + Pat<(cond R64C:$rA, R64C:$rB), + (XORIr32 compare.Fragment, -1)>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// The i64 seteq fragment that does the scalar->vector conversion and +// comparison: +def CEQr64compare: + CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))), 0xb)>; + +// The i64 seteq fragment that does the vector comparison +def CEQv2i64compare: + CodeFrag<(CEQIv4i32 (GBv4i32 (CEQv4i32 VECREG:$rA, VECREG:$rB)), 0xf)>; + +// i64 seteq (equality): the setcc result is i32, which is converted to a +// vector FSM mask when used in a select pattern. 
+// +// v2i64 seteq (equality): the setcc result is v4i32 +multiclass CompareEqual64 { + // Plain old comparison, converts back to i32 scalar + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CEQr64compare.Fragment, R32C))>; + def v2i64: CodeFrag<(i32 (COPY_TO_REGCLASS CEQv2i64compare.Fragment, R32C))>; + + // SELB mask from FSM: + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CEQr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CEQv2i64compare.Fragment), R32C))>; +} + +defm I64EQ: CompareEqual64; + +def : Pat<(seteq R64C:$rA, R64C:$rB), I64EQr64.Fragment>; +def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), I64EQv2i64.Fragment>; + +// i64 setne: +def : I64SETCCNegCond<setne, I64EQr64>; +def : I64SELECTNegCond<setne, I64EQr64>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// i64 setugt/setule: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def CLGTr64ugt: + CodeFrag<(CLGTv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; + +def CLGTr64eq: + CodeFrag<(CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; + +def CLGTr64compare: + CodeFrag<(SELBv2i64 CLGTr64ugt.Fragment, + (XSWDv2i64 CLGTr64ugt.Fragment), + CLGTr64eq.Fragment)>; + +def CLGTv2i64ugt: + CodeFrag<(CLGTv4i32 VECREG:$rA, VECREG:$rB)>; + +def CLGTv2i64eq: + CodeFrag<(CEQv4i32 VECREG:$rA, VECREG:$rB)>; + +def CLGTv2i64compare: + CodeFrag<(SELBv2i64 CLGTv2i64ugt.Fragment, + (XSWDv2i64 CLGTr64ugt.Fragment), + CLGTv2i64eq.Fragment)>; + +multiclass CompareLogicalGreaterThan64 { + // Plain old comparison, converts back to i32 scalar + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CLGTr64compare.Fragment, R32C))>; + def v2i64: CodeFrag<CLGTv2i64compare.Fragment>; + + // SELB mask from FSM: + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGTr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGTv2i64compare.Fragment), R32C))>; +} + +defm I64LGT: CompareLogicalGreaterThan64; + +def : Pat<(setugt R64C:$rA, R64C:$rB), I64LGTr64.Fragment>; +//def : Pat<(setugt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), +// I64LGTv2i64.Fragment>; + +// i64 setult: +def : I64SETCCNegCond<setule, I64LGTr64>; +def : I64SELECTNegCond<setule, I64LGTr64>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// i64 setuge/setult: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def CLGEr64compare: + CodeFrag<(CGTIv4i32 (GBv4i32 (ORv4i32 CLGTr64ugt.Fragment, + CLGTr64eq.Fragment)), 0xb)>; + +def CLGEv2i64compare: + CodeFrag<(CEQIv4i32 (GBv4i32 (ORv4i32 CLGTv2i64ugt.Fragment, + CLGTv2i64eq.Fragment)), 0xf)>; + +multiclass CompareLogicalGreaterEqual64 { + // Plain old comparison, converts back to i32 scalar + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CLGEr64compare.Fragment, R32C))>; + def v2i64: CodeFrag<CLGEv2i64compare.Fragment>; + + // SELB mask from FSM: + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGEr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGEv2i64compare.Fragment),R32C))>; +} + +defm I64LGE: CompareLogicalGreaterEqual64; + +def : Pat<(setuge R64C:$rA, R64C:$rB), I64LGEr64.Fragment>; +def : Pat<(v2i64 (setuge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB))), + I64LGEv2i64.Fragment>; + + +// i64 setult: +def : I64SETCCNegCond<setult, I64LGEr64>; +def : I64SELECTNegCond<setult, I64LGEr64>; + 
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// i64 setgt/setle: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def CGTr64sgt: + CodeFrag<(CGTv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; + +def CGTr64eq: + CodeFrag<(CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; + +def CGTr64compare: + CodeFrag<(SELBv2i64 CGTr64sgt.Fragment, + (XSWDv2i64 CGTr64sgt.Fragment), + CGTr64eq.Fragment)>; + +def CGTv2i64sgt: + CodeFrag<(CGTv4i32 VECREG:$rA, VECREG:$rB)>; + +def CGTv2i64eq: + CodeFrag<(CEQv4i32 VECREG:$rA, VECREG:$rB)>; + +def CGTv2i64compare: + CodeFrag<(SELBv2i64 CGTv2i64sgt.Fragment, + (XSWDv2i64 CGTr64sgt.Fragment), + CGTv2i64eq.Fragment)>; + +multiclass CompareGreaterThan64 { + // Plain old comparison, converts back to i32 scalar + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CGTr64compare.Fragment, R32C))>; + def v2i64: CodeFrag<CGTv2i64compare.Fragment>; + + // SELB mask from FSM: + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CGTr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CGTv2i64compare.Fragment), R32C))>; +} + +defm I64GT: CompareLogicalGreaterThan64; + +def : Pat<(setgt R64C:$rA, R64C:$rB), I64GTr64.Fragment>; +//def : Pat<(setgt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), +// I64GTv2i64.Fragment>; + +// i64 setult: +def : I64SETCCNegCond<setle, I64GTr64>; +def : I64SELECTNegCond<setle, I64GTr64>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// i64 setge/setlt: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def CGEr64compare: + CodeFrag<(CGTIv4i32 (GBv4i32 (ORv4i32 CGTr64sgt.Fragment, + CGTr64eq.Fragment)), 0xb)>; + +def CGEv2i64compare: + CodeFrag<(CEQIv4i32 (GBv4i32 (ORv4i32 CGTv2i64sgt.Fragment, + CGTv2i64eq.Fragment)), 0xf)>; + +multiclass CompareGreaterEqual64 { + // Plain old comparison, converts back to i32 scalar + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CGEr64compare.Fragment, R32C))>; + def v2i64: CodeFrag<CGEv2i64compare.Fragment>; + + // SELB mask from FSM: + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS (FSMv4i32 CGEr64compare.Fragment),R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS (FSMv4i32 CGEv2i64compare.Fragment),R32C))>; +} + +defm I64GE: CompareGreaterEqual64; + +def : Pat<(setge R64C:$rA, R64C:$rB), I64GEr64.Fragment>; +def : Pat<(v2i64 (setge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB))), + I64GEv2i64.Fragment>; + +// i64 setult: +def : I64SETCCNegCond<setlt, I64GEr64>; +def : I64SELECTNegCond<setlt, I64GEr64>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v2i64, i64 add +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class v2i64_add_cg<dag lhs, dag rhs>: + CodeFrag<(CGv4i32 lhs, rhs)>; + +class v2i64_add_1<dag lhs, dag rhs, dag cg, dag cg_mask>: + CodeFrag<(ADDXv4i32 lhs, rhs, (SHUFBv4i32 cg, cg, cg_mask))>; + +class v2i64_add<dag lhs, dag rhs, dag cg_mask>: + v2i64_add_1<lhs, rhs, v2i64_add_cg<lhs, rhs>.Fragment, cg_mask>; + +def : Pat<(SPUadd64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), + (COPY_TO_REGCLASS v2i64_add<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; + +def : Pat<(SPUadd64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), + (v4i32 VECREG:$rCGmask)), + v2i64_add<(v2i64 VECREG:$rA), + (v2i64 VECREG:$rB), + (v4i32 VECREG:$rCGmask)>.Fragment>; + 
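The SPUadd64 patterns above assemble a 64-bit add from 32-bit word operations: cg produces the carry out of the low word and addx folds it into the high word (the actual pattern moves the carry into position with SHUFB and the $rCGmask operand). A scalar C++ sketch of that decomposition; illustrative only, not code from this backend:

#include <cstdint>
#include <cassert>

// 64-bit add built from a 32-bit add plus carry-generate/add-extended,
// modeling the v2i64_add_cg / v2i64_add_1 fragments on one element.
std::uint64_t add64_via_cg_addx(std::uint64_t a, std::uint64_t b) {
  std::uint32_t aHi = std::uint32_t(a >> 32), aLo = std::uint32_t(a);
  std::uint32_t bHi = std::uint32_t(b >> 32), bLo = std::uint32_t(b);
  std::uint32_t lo    = aLo + bLo;                // A: plain 32-bit add, low slot
  std::uint32_t carry = (lo < aLo) ? 1u : 0u;     // CG: carry out of the low slot
  std::uint32_t hi    = aHi + bHi + carry;        // ADDX: add with the carried-in bit
  return (std::uint64_t(hi) << 32) | lo;
}

int main() {
  assert(add64_via_cg_addx(0xFFFFFFFFULL, 1) == 0x100000000ULL);
  assert(add64_via_cg_addx(0x123456789ABCDEF0ULL, 0x0FEDCBA987654321ULL) ==
         0x123456789ABCDEF0ULL + 0x0FEDCBA987654321ULL);
  return 0;
}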
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v2i64, i64 subtraction +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class v2i64_sub_bg<dag lhs, dag rhs>: CodeFrag<(BGv4i32 lhs, rhs)>; + +class v2i64_sub<dag lhs, dag rhs, dag bg, dag bg_mask>: + CodeFrag<(SFXv4i32 lhs, rhs, (SHUFBv4i32 bg, bg, bg_mask))>; + +def : Pat<(SPUsub64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), + (COPY_TO_REGCLASS + v2i64_sub<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + v2i64_sub_bg<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG)>.Fragment, + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; + +def : Pat<(SPUsub64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), + (v4i32 VECREG:$rCGmask)), + v2i64_sub<(v2i64 VECREG:$rA), + (v2i64 VECREG:$rB), + v2i64_sub_bg<(v2i64 VECREG:$rA), + (v2i64 VECREG:$rB)>.Fragment, + (v4i32 VECREG:$rCGmask)>.Fragment>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v2i64, i64 multiply +// +// Note: i64 multiply is simply the vector->scalar conversion of the +// full-on v2i64 multiply, since the entire vector has to be manipulated +// anyway. +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class v2i64_mul_ahi64<dag rA> : + CodeFrag<(SELBv4i32 rA, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>; + +class v2i64_mul_bhi64<dag rB> : + CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>; + +class v2i64_mul_alo64<dag rB> : + CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>; + +class v2i64_mul_blo64<dag rB> : + CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>; + +class v2i64_mul_ashlq2<dag rA>: + CodeFrag<(SHLQBYIv4i32 rA, 0x2)>; + +class v2i64_mul_ashlq4<dag rA>: + CodeFrag<(SHLQBYIv4i32 rA, 0x4)>; + +class v2i64_mul_bshlq2<dag rB> : + CodeFrag<(SHLQBYIv4i32 rB, 0x2)>; + +class v2i64_mul_bshlq4<dag rB> : + CodeFrag<(SHLQBYIv4i32 rB, 0x4)>; + +class v2i64_highprod<dag rA, dag rB>: + CodeFrag<(Av4i32 + (Av4i32 + (MPYUv4i32 v2i64_mul_bshlq4<rB>.Fragment, // a1 x b3 + v2i64_mul_ahi64<rA>.Fragment), + (MPYHv4i32 v2i64_mul_ahi64<rA>.Fragment, // a0 x b3 + v2i64_mul_bshlq4<rB>.Fragment)), + (Av4i32 + (MPYHv4i32 v2i64_mul_bhi64<rB>.Fragment, + v2i64_mul_ashlq4<rA>.Fragment), + (Av4i32 + (MPYHv4i32 v2i64_mul_ashlq4<rA>.Fragment, + v2i64_mul_bhi64<rB>.Fragment), + (Av4i32 + (MPYUv4i32 v2i64_mul_ashlq4<rA>.Fragment, + v2i64_mul_bhi64<rB>.Fragment), + (Av4i32 + (MPYHv4i32 v2i64_mul_ashlq2<rA>.Fragment, + v2i64_mul_bshlq2<rB>.Fragment), + (MPYUv4i32 v2i64_mul_ashlq2<rA>.Fragment, + v2i64_mul_bshlq2<rB>.Fragment))))))>; + +class v2i64_mul_a3_b3<dag rA, dag rB>: + CodeFrag<(MPYUv4i32 v2i64_mul_alo64<rA>.Fragment, + v2i64_mul_blo64<rB>.Fragment)>; + +class v2i64_mul_a2_b3<dag rA, dag rB>: + CodeFrag<(SELBv4i32 (SHLQBYIv4i32 + (MPYHHUv4i32 v2i64_mul_alo64<rA>.Fragment, + v2i64_mul_bshlq2<rB>.Fragment), 0x2), + (ILv4i32 0), + (FSMBIv4i32 0xc3c3))>; + +class v2i64_mul_a3_b2<dag rA, dag rB>: + CodeFrag<(SELBv4i32 (SHLQBYIv4i32 + (MPYHHUv4i32 v2i64_mul_blo64<rB>.Fragment, + v2i64_mul_ashlq2<rA>.Fragment), 0x2), + (ILv4i32 0), + (FSMBIv4i32 0xc3c3))>; + +class v2i64_lowsum<dag rA, dag rB, dag rCGmask>: + v2i64_add<v2i64_add<v2i64_mul_a3_b3<rA, rB>.Fragment, + v2i64_mul_a2_b3<rA, rB>.Fragment, rCGmask>.Fragment, + v2i64_mul_a3_b2<rA, rB>.Fragment, rCGmask>; + +class v2i64_mul<dag rA, dag rB, dag rCGmask>: + v2i64_add<v2i64_lowsum<rA, rB, rCGmask>.Fragment, + (SELBv4i32 v2i64_highprod<rA, rB>.Fragment, + (ILv4i32 0), + 
(FSMBIv4i32 0x0f0f)), + rCGmask>; + +def : Pat<(SPUmul64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), + (COPY_TO_REGCLASS v2i64_mul<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; + +def : Pat<(SPUmul64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), + (v4i32 VECREG:$rCGmask)), + v2i64_mul<(v2i64 VECREG:$rA), (v2i64 VECREG:$rB), + (v4i32 VECREG:$rCGmask)>.Fragment>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// f64 comparisons +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +// selb instruction definition for i64. Note that the selection mask is +// a vector, produced by various forms of FSM: +def SELBf64_cond: + SELBInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB, R32C:$rC), + [(set R64FP:$rT, + (select R32C:$rC, R64FP:$rB, R64FP:$rA))]>; diff --git a/contrib/llvm/lib/Target/CellSPU/SPUAsmPrinter.cpp b/contrib/llvm/lib/Target/CellSPU/SPUAsmPrinter.cpp new file mode 100644 index 000000000000..14021fef05d9 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUAsmPrinter.cpp @@ -0,0 +1,333 @@ +//===-- SPUAsmPrinter.cpp - Print machine instrs to Cell SPU assembly -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to Cell SPU assembly language. This printer +// is the output mechanism used by `llc'. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asmprinter" +#include "SPU.h" +#include "SPUTargetMachine.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { + class SPUAsmPrinter : public AsmPrinter { + public: + explicit SPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : + AsmPrinter(TM, Streamer) {} + + virtual const char *getPassName() const { + return "STI CBEA SPU Assembly Printer"; + } + + /// printInstruction - This method is automatically generated by tablegen + /// from the instruction set description. 
+ void printInstruction(const MachineInstr *MI, raw_ostream &OS); + static const char *getRegisterName(unsigned RegNo); + + + void EmitInstruction(const MachineInstr *MI) { + SmallString<128> Str; + raw_svector_ostream OS(Str); + printInstruction(MI, OS); + OutStreamer.EmitRawText(OS.str()); + } + void printOp(const MachineOperand &MO, raw_ostream &OS); + + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + if (MO.isReg()) { + O << getRegisterName(MO.getReg()); + } else if (MO.isImm()) { + O << MO.getImm(); + } else { + printOp(MO, O); + } + } + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O); + + + void + printU7ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + unsigned int value = MI->getOperand(OpNo).getImm(); + assert(value < (1 << 8) && "Invalid u7 argument"); + O << value; + } + + void + printShufAddr(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + char value = MI->getOperand(OpNo).getImm(); + O << (int) value; + O << "("; + printOperand(MI, OpNo+1, O); + O << ")"; + } + + void + printS16ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + O << (short) MI->getOperand(OpNo).getImm(); + } + + void + printU16ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + O << (unsigned short)MI->getOperand(OpNo).getImm(); + } + + void + printMemRegReg(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + // When used as the base register, r0 reads constant zero rather than + // the value contained in the register. For this reason, the darwin + // assembler requires that we print r0 as 0 (no r) when used as the base. + const MachineOperand &MO = MI->getOperand(OpNo); + O << getRegisterName(MO.getReg()) << ", "; + printOperand(MI, OpNo+1, O); + } + + void + printU18ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + unsigned int value = MI->getOperand(OpNo).getImm(); + assert(value <= (1 << 19) - 1 && "Invalid u18 argument"); + O << value; + } + + void + printS10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16) + >> 16); + assert((value >= -(1 << 9) && value <= (1 << 9) - 1) + && "Invalid s10 argument"); + O << value; + } + + void + printU10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16) + >> 16); + assert((value <= (1 << 10) - 1) && "Invalid u10 argument"); + O << value; + } + + void + printDFormAddr(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + assert(MI->getOperand(OpNo).isImm() && + "printDFormAddr first operand is not immediate"); + int64_t value = int64_t(MI->getOperand(OpNo).getImm()); + int16_t value16 = int16_t(value); + assert((value16 >= -(1 << (9+4)) && value16 <= (1 << (9+4)) - 1) + && "Invalid dform s10 offset argument"); + O << (value16 & ~0xf) << "("; + printOperand(MI, OpNo+1, O); + O << ")"; + } + + void + printAddr256K(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) + { + /* Note: operand 1 is an offset or symbol name. 
*/ + if (MI->getOperand(OpNo).isImm()) { + printS16ImmOperand(MI, OpNo, O); + } else { + printOp(MI->getOperand(OpNo), O); + if (MI->getOperand(OpNo+1).isImm()) { + int displ = int(MI->getOperand(OpNo+1).getImm()); + if (displ > 0) + O << "+" << displ; + else if (displ < 0) + O << displ; + } + } + } + + void printCallOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + printOp(MI->getOperand(OpNo), O); + } + + void printHBROperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + printOp(MI->getOperand(OpNo), O); + } + + void printPCRelativeOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + // Used to generate a ".-<target>", but it turns out that the assembler + // really wants the target. + // + // N.B.: This operand is used for call targets. Branch hints are another + // animal entirely. + printOp(MI->getOperand(OpNo), O); + } + + void printSymbolHi(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) { + printS16ImmOperand(MI, OpNo, O); + } else { + printOp(MI->getOperand(OpNo), O); + O << "@h"; + } + } + + void printSymbolLo(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) { + printS16ImmOperand(MI, OpNo, O); + } else { + printOp(MI->getOperand(OpNo), O); + O << "@l"; + } + } + + /// Print local store address + void printSymbolLSA(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + printOp(MI->getOperand(OpNo), O); + } + + void printROTHNeg7Imm(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) { + int value = (int) MI->getOperand(OpNo).getImm(); + assert((value >= 0 && value < 16) + && "Invalid negated immediate rotate 7-bit argument"); + O << -value; + } else { + llvm_unreachable("Invalid/non-immediate rotate amount in printRotateNeg7Imm"); + } + } + + void printROTNeg7Imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O){ + assert(MI->getOperand(OpNo).isImm() && + "Invalid/non-immediate rotate amount in printRotateNeg7Imm"); + int value = (int) MI->getOperand(OpNo).getImm(); + assert((value >= 0 && value <= 32) + && "Invalid negated immediate rotate 7-bit argument"); + O << -value; + } + }; +} // end of anonymous namespace + +// Include the auto-generated portion of the assembly writer +#include "SPUGenAsmWriter.inc" + +void SPUAsmPrinter::printOp(const MachineOperand &MO, raw_ostream &O) { + switch (MO.getType()) { + case MachineOperand::MO_Immediate: + report_fatal_error("printOp() does not handle immediate values"); + + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + case MachineOperand::MO_JumpTableIndex: + O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << MO.getIndex(); + return; + case MachineOperand::MO_ConstantPoolIndex: + O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() + << '_' << MO.getIndex(); + return; + case MachineOperand::MO_ExternalSymbol: + // Computing the address of an external symbol, not calling it. 
+ if (TM.getRelocationModel() != Reloc::Static) { + O << "L" << MAI->getGlobalPrefix() << MO.getSymbolName() + << "$non_lazy_ptr"; + return; + } + O << *GetExternalSymbolSymbol(MO.getSymbolName()); + return; + case MachineOperand::MO_GlobalAddress: + // External or weakly linked global variables need non-lazily-resolved + // stubs + if (TM.getRelocationModel() != Reloc::Static) { + const GlobalValue *GV = MO.getGlobal(); + if (((GV->isDeclaration() || GV->hasWeakLinkage() || + GV->hasLinkOnceLinkage() || GV->hasCommonLinkage()))) { + O << *GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + return; + } + } + O << *Mang->getSymbol(MO.getGlobal()); + return; + case MachineOperand::MO_MCSymbol: + O << *(MO.getMCSymbol()); + return; + default: + O << "<unknown operand type: " << MO.getType() << ">"; + return; + } +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool SPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'L': // Write second word of DImode reference. + // Verify that this operand has two consecutive registers. + if (!MI->getOperand(OpNo).isReg() || + OpNo+1 == MI->getNumOperands() || + !MI->getOperand(OpNo+1).isReg()) + return true; + ++OpNo; // Return the high-part. + break; + } + } + + printOperand(MI, OpNo, O); + return false; +} + +bool SPUAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + printMemRegReg(MI, OpNo, O); + return false; +} + +// Force static initialization. +extern "C" void LLVMInitializeCellSPUAsmPrinter() { + RegisterAsmPrinter<SPUAsmPrinter> X(TheCellSPUTarget); +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPUCallingConv.td b/contrib/llvm/lib/Target/CellSPU/SPUCallingConv.td new file mode 100644 index 000000000000..9f9692bf67fe --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUCallingConv.td @@ -0,0 +1,57 @@ +//===- SPUCallingConv.td - Calling Conventions for CellSPU -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the STI Cell SPU architecture. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. 
+class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention +//===----------------------------------------------------------------------===// + +// Return-value convention for Cell SPU: return value to be passed in reg 3-74 +def RetCC_SPU : CallingConv<[ + CCIfType<[i8,i16,i32,i64,i128,f32,f64,v16i8,v8i16,v4i32,v2i64,v4f32,v2f64], + CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74]>> +]>; + + +//===----------------------------------------------------------------------===// +// CellSPU Argument Calling Conventions +//===----------------------------------------------------------------------===// +def CCC_SPU : CallingConv<[ + CCIfType<[i8, i16, i32, i64, i128, f32, f64, + v16i8, v8i16, v4i32, v4f32, v2i64, v2f64], + CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74]>>, + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToStack<16, 16>> +]>; diff --git a/contrib/llvm/lib/Target/CellSPU/SPUFrameLowering.cpp b/contrib/llvm/lib/Target/CellSPU/SPUFrameLowering.cpp new file mode 100644 index 000000000000..fac806e1b0ea --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUFrameLowering.cpp @@ -0,0 +1,256 @@ +//===-- SPUTargetMachine.cpp - Define TargetMachine for Cell SPU ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Top-level implementation for the Cell SPU target. 
+// +//===----------------------------------------------------------------------===// + +#include "SPUFrameLowering.h" +#include "SPU.h" +#include "SPUInstrBuilder.h" +#include "SPUInstrInfo.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// SPUFrameLowering: +//===----------------------------------------------------------------------===// + +SPUFrameLowering::SPUFrameLowering(const SPUSubtarget &sti) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 16, 0), + Subtarget(sti) { + LR[0].first = SPU::R0; + LR[0].second = 16; +} + + +//-------------------------------------------------------------------------- +// hasFP - Return true if the specified function actually has a dedicated frame +// pointer register. This is true if the function needs a frame pointer and has +// a non-zero stack size. +bool SPUFrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + return MFI->getStackSize() && + (MF.getTarget().Options.DisableFramePointerElim(MF) || + MFI->hasVarSizedObjects()); +} + + +/// determineFrameLayout - Determine the size of the frame and maximum call +/// frame size. +void SPUFrameLowering::determineFrameLayout(MachineFunction &MF) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo + unsigned FrameSize = MFI->getStackSize(); + + // Get the alignments provided by the target, and the maximum alignment + // (if any) of the fixed frame objects. + unsigned TargetAlign = getStackAlignment(); + unsigned Align = std::max(TargetAlign, MFI->getMaxAlignment()); + assert(isPowerOf2_32(Align) && "Alignment is not power of 2"); + unsigned AlignMask = Align - 1; + + // Get the maximum call frame size of all the calls. + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + + // If we have dynamic alloca then maxCallFrameSize needs to be aligned so + // that allocations will be aligned. + if (MFI->hasVarSizedObjects()) + maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask; + + // Update maximum call frame size. + MFI->setMaxCallFrameSize(maxCallFrameSize); + + // Include call frame size in total. + FrameSize += maxCallFrameSize; + + // Make sure the frame is aligned. + FrameSize = (FrameSize + AlignMask) & ~AlignMask; + + // Update frame info. + MFI->setStackSize(FrameSize); +} + +void SPUFrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const SPUInstrInfo &TII = + *static_cast<const SPUInstrInfo*>(MF.getTarget().getInstrInfo()); + MachineModuleInfo &MMI = MF.getMMI(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Prepare for debug frame info. + bool hasDebugInfo = MMI.hasDebugInfo(); + MCSymbol *FrameLabel = 0; + + // Move MBBI back to the beginning of the function. + MBBI = MBB.begin(); + + // Work out frame sizes. 
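As a quick orientation before the call that follows: determineFrameLayout (defined above) folds the maximum outgoing call-frame size into the local area and rounds the total up to the stack alignment (16 bytes by default), and the prologue below then uses a short ai-style adjustment when the resulting size fits a signed 10-bit immediate (isInt<10>, i.e. -512..511) and the longer ILr32/Ar32 sequence otherwise. The stand-alone sketch here only restates that arithmetic; the helper name and the sample sizes are invented for illustration and are not taken from this commit.

#include <cassert>
#include <cstdio>

// Round Size up to the next multiple of Align (a power of two), mirroring the
// (FrameSize + AlignMask) & ~AlignMask idiom used in determineFrameLayout.
static unsigned alignTo(unsigned Size, unsigned Align) {
  assert((Align & (Align - 1)) == 0 && "Alignment is not power of 2");
  unsigned AlignMask = Align - 1;
  return (Size + AlignMask) & ~AlignMask;
}

int main() {
  const unsigned StackAlign = 16;   // stack alignment used by this target
  unsigned Locals = 40;             // hypothetical local-variable bytes
  unsigned MaxCallFrame = 20;       // hypothetical outgoing call-frame bytes
  unsigned FrameSize = alignTo(Locals + MaxCallFrame, StackAlign);
  std::printf("frame size = %u\n", FrameSize);   // 60 rounds up to 64
  int Adjust = -(int)FrameSize;     // the prologue subtracts from $sp
  assert(Adjust >= -512 && Adjust <= 511 && "fits the signed 10-bit ai field");
  return 0;
}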
+ determineFrameLayout(MF); + int FrameSize = MFI->getStackSize(); + + assert((FrameSize & 0xf) == 0 + && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->adjustsStack()) { + FrameSize = -(FrameSize + SPUFrameLowering::minStackSize()); + if (hasDebugInfo) { + // Mark effective beginning of when frame pointer becomes valid. + FrameLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(FrameLabel); + } + + // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp) + // for the ABI + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16) + .addReg(SPU::R1); + if (isInt<10>(FrameSize)) { + // Spill $sp to adjusted $sp + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize) + .addReg(SPU::R1); + // Adjust $sp by required amout + BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1) + .addImm(FrameSize); + } else if (isInt<16>(FrameSize)) { + // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use + // $r2 to adjust $sp: + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) + .addImm(-16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) + .addImm(FrameSize); + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQXr32), SPU::R1) + .addReg(SPU::R2) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) + .addReg(SPU::R1) + .addReg(SPU::R2); + BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2) + .addReg(SPU::R2) + .addImm(16); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) + .addReg(SPU::R2) + .addReg(SPU::R1); + } else { + report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); + } + + if (hasDebugInfo) { + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + + // Show update of SP. + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize); + Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); + unsigned Reg = CSI[I].getReg(); + if (Reg == SPU::R0) continue; + MachineLocation CSDst(MachineLocation::VirtualFP, Offset); + MachineLocation CSSrc(Reg); + Moves.push_back(MachineMove(FrameLabel, CSDst, CSSrc)); + } + + // Mark effective beginning of when frame pointer is ready. 
+ MCSymbol *ReadyLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(ReadyLabel); + + MachineLocation FPDst(SPU::R1); + MachineLocation FPSrc(MachineLocation::VirtualFP); + Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc)); + } + } +} + +void SPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + const SPUInstrInfo &TII = + *static_cast<const SPUInstrInfo*>(MF.getTarget().getInstrInfo()); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int FrameSize = MFI->getStackSize(); + int LinkSlotOffset = SPUFrameLowering::stackSlotSize(); + DebugLoc dl = MBBI->getDebugLoc(); + + assert(MBBI->getOpcode() == SPU::RET && + "Can only insert epilog into returning blocks"); + assert((FrameSize & 0xf) == 0 && "FrameSize not aligned"); + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->adjustsStack()) { + FrameSize = FrameSize + SPUFrameLowering::minStackSize(); + if (isInt<10>(FrameSize + LinkSlotOffset)) { + // Reload $lr, adjust $sp by required amount + // Note: We do this to slightly improve dual issue -- not by much, but it + // is an opportunity for dual issue. + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) + .addImm(FrameSize + LinkSlotOffset) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1) + .addReg(SPU::R1) + .addImm(FrameSize); + } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) { + // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use + // $r2 to adjust $sp: + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) + .addImm(16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) + .addImm(FrameSize); + BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) + .addReg(SPU::R1) + .addReg(SPU::R2); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) + .addImm(16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2). + addReg(SPU::R2) + .addImm(16); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) + .addReg(SPU::R2) + .addReg(SPU::R1); + } else { + report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); + } + } +} + +void SPUFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const{ + // Mark LR and SP unused, since the prolog spills them to stack and + // we don't want anyone else to spill them for us. + // + // Also, unless R2 is really used someday, don't spill it automatically. + MF.getRegInfo().setPhysRegUnused(SPU::R0); + MF.getRegInfo().setPhysRegUnused(SPU::R1); + MF.getRegInfo().setPhysRegUnused(SPU::R2); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterClass *RC = &SPU::R32CRegClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPUFrameLowering.h b/contrib/llvm/lib/Target/CellSPU/SPUFrameLowering.h new file mode 100644 index 000000000000..11c52818dd9c --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUFrameLowering.h @@ -0,0 +1,80 @@ +//===-- SPUFrameLowering.h - SPU Frame Lowering stuff ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains CellSPU frame information that doesn't fit anywhere else +// cleanly... +// +//===----------------------------------------------------------------------===// + +#ifndef SPU_FRAMEINFO_H +#define SPU_FRAMEINFO_H + +#include "SPURegisterInfo.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + class SPUSubtarget; + + class SPUFrameLowering: public TargetFrameLowering { + const SPUSubtarget &Subtarget; + std::pair<unsigned, int> LR[1]; + + public: + SPUFrameLowering(const SPUSubtarget &sti); + + //! Determine the frame's layour + void determineFrameLayout(MachineFunction &MF) const; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + //! Prediate: Target has dedicated frame pointer + bool hasFP(const MachineFunction &MF) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + + //! Return a function's saved spill slots + /*! + For CellSPU, a function's saved spill slots is just the link register. + */ + const std::pair<unsigned, int> * + getCalleeSaveSpillSlots(unsigned &NumEntries) const; + + //! Stack slot size (16 bytes) + static int stackSlotSize() { + return 16; + } + //! Maximum frame offset representable by a signed 10-bit integer + /*! + This is the maximum frame offset that can be expressed as a 10-bit + integer, used in D-form addresses. + */ + static int maxFrameOffset() { + return ((1 << 9) - 1) * stackSlotSize(); + } + //! Minimum frame offset representable by a signed 10-bit integer + static int minFrameOffset() { + return -(1 << 9) * stackSlotSize(); + } + //! Minimum frame size (enough to spill LR + SP) + static int minStackSize() { + return (2 * stackSlotSize()); + } + //! Convert frame index to stack offset + static int FItoStackOffset(int frame_index) { + return frame_index * stackSlotSize(); + } + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/contrib/llvm/lib/Target/CellSPU/SPUHazardRecognizers.cpp new file mode 100644 index 000000000000..403d7ef1fd9e --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUHazardRecognizers.cpp @@ -0,0 +1,141 @@ +//===-- SPUHazardRecognizers.cpp - Cell Hazard Recognizer Impls -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements hazard recognizers for scheduling on Cell SPU +// processors. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sched" + +#include "SPUHazardRecognizers.h" +#include "SPU.h" +#include "SPUInstrInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Cell SPU hazard recognizer +// +// This is the pipeline hazard recognizer for the Cell SPU processor. It does +// very little right now. 
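The #if 0 block further down records the intended heuristic: quadword loads, stores and RET are treated as odd-pipeline instructions, a hazard is reported when such an instruction would land on an even issue slot, and the recognizer flips its notion of the current slot on every query. The stripped-down model below is written independently of the LLVM scheduler types and is offered only as an illustration; the type and function names are invented, and mustBeOdd stands in for the opcode list in the disabled code.

#include <cstdio>

// Toy model of the even/odd dual-issue alternation sketched in the disabled
// code below; a real hazard recognizer works from SUnit/opcode information.
struct PipelineModel {
  int EvenOdd = 0;                        // 0: next issue slot is even, 1: odd
  bool wouldStall(bool mustBeOdd) {
    bool hazard = mustBeOdd && !EvenOdd;  // odd-only instruction on an even slot
    EvenOdd ^= 1;                         // issue slots alternate every cycle
    return hazard;
  }
};

int main() {
  PipelineModel P;
  // Two back-to-back odd-only instructions: the first hits an even slot and
  // stalls, the second lands on the following odd slot and issues cleanly.
  std::printf("%d %d\n", P.wouldStall(true), P.wouldStall(true));  // prints 1 0
  return 0;
}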
+//===----------------------------------------------------------------------===// + +SPUHazardRecognizer::SPUHazardRecognizer(const TargetInstrInfo &tii) : + TII(tii), + EvenOdd(0) +{ +} + +/// Return the pipeline hazard type encountered or generated by this +/// instruction. Currently returns NoHazard. +/// +/// \return NoHazard +ScheduleHazardRecognizer::HazardType +SPUHazardRecognizer::getHazardType(SUnit *SU, int Stalls) +{ + // Initial thoughts on how to do this, but this code cannot work unless the + // function's prolog and epilog code are also being scheduled so that we can + // accurately determine which pipeline is being scheduled. +#if 0 + assert(Stalls == 0 && "SPU hazards don't yet support scoreboard lookahead"); + + const SDNode *Node = SU->getNode()->getFlaggedMachineNode(); + ScheduleHazardRecognizer::HazardType retval = NoHazard; + bool mustBeOdd = false; + + switch (Node->getOpcode()) { + case SPU::LQDv16i8: + case SPU::LQDv8i16: + case SPU::LQDv4i32: + case SPU::LQDv4f32: + case SPU::LQDv2f64: + case SPU::LQDr128: + case SPU::LQDr64: + case SPU::LQDr32: + case SPU::LQDr16: + case SPU::LQAv16i8: + case SPU::LQAv8i16: + case SPU::LQAv4i32: + case SPU::LQAv4f32: + case SPU::LQAv2f64: + case SPU::LQAr128: + case SPU::LQAr64: + case SPU::LQAr32: + case SPU::LQXv4i32: + case SPU::LQXr128: + case SPU::LQXr64: + case SPU::LQXr32: + case SPU::LQXr16: + case SPU::STQDv16i8: + case SPU::STQDv8i16: + case SPU::STQDv4i32: + case SPU::STQDv4f32: + case SPU::STQDv2f64: + case SPU::STQDr128: + case SPU::STQDr64: + case SPU::STQDr32: + case SPU::STQDr16: + case SPU::STQDr8: + case SPU::STQAv16i8: + case SPU::STQAv8i16: + case SPU::STQAv4i32: + case SPU::STQAv4f32: + case SPU::STQAv2f64: + case SPU::STQAr128: + case SPU::STQAr64: + case SPU::STQAr32: + case SPU::STQAr16: + case SPU::STQAr8: + case SPU::STQXv16i8: + case SPU::STQXv8i16: + case SPU::STQXv4i32: + case SPU::STQXv4f32: + case SPU::STQXv2f64: + case SPU::STQXr128: + case SPU::STQXr64: + case SPU::STQXr32: + case SPU::STQXr16: + case SPU::STQXr8: + case SPU::RET: + mustBeOdd = true; + break; + default: + // Assume that this instruction can be on the even pipe + break; + } + + if (mustBeOdd && !EvenOdd) + retval = Hazard; + + DEBUG(errs() << "SPUHazardRecognizer EvenOdd " << EvenOdd << " Hazard " + << retval << "\n"); + EvenOdd ^= 1; + return retval; +#else + return NoHazard; +#endif +} + +void SPUHazardRecognizer::EmitInstruction(SUnit *SU) +{ +} + +void SPUHazardRecognizer::AdvanceCycle() +{ + DEBUG(errs() << "SPUHazardRecognizer::AdvanceCycle\n"); +} + +void SPUHazardRecognizer::EmitNoop() +{ + AdvanceCycle(); +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPUHazardRecognizers.h b/contrib/llvm/lib/Target/CellSPU/SPUHazardRecognizers.h new file mode 100644 index 000000000000..675632cc7f13 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUHazardRecognizers.h @@ -0,0 +1,41 @@ +//===-- SPUHazardRecognizers.h - Cell SPU Hazard Recognizer -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling on the Cell SPU +// processor. 
+// +//===----------------------------------------------------------------------===// + +#ifndef SPUHAZRECS_H +#define SPUHAZRECS_H + +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" + +namespace llvm { + +class TargetInstrInfo; + +/// SPUHazardRecognizer +class SPUHazardRecognizer : public ScheduleHazardRecognizer +{ +private: + const TargetInstrInfo &TII; + int EvenOdd; + +public: + SPUHazardRecognizer(const TargetInstrInfo &TII); + virtual HazardType getHazardType(SUnit *SU, int Stalls); + virtual void EmitInstruction(SUnit *SU); + virtual void AdvanceCycle(); + virtual void EmitNoop(); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/CellSPU/SPUISelDAGToDAG.cpp new file mode 100644 index 000000000000..c27caeae7d45 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -0,0 +1,1193 @@ +//===-- SPUISelDAGToDAG.cpp - CellSPU pattern matching inst selector ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pattern matching instruction selector for the Cell SPU, +// converting from a legalized dag to a SPU-target dag. +// +//===----------------------------------------------------------------------===// + +#include "SPU.h" +#include "SPUTargetMachine.h" +#include "SPUHazardRecognizers.h" +#include "SPUFrameLowering.h" +#include "SPUTargetMachine.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Constants.h" +#include "llvm/GlobalValue.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates + bool + isI32IntS10Immediate(ConstantSDNode *CN) + { + return isInt<10>(CN->getSExtValue()); + } + + //! ConstantSDNode predicate for i32 unsigned 10-bit immediate values + bool + isI32IntU10Immediate(ConstantSDNode *CN) + { + return isUInt<10>(CN->getSExtValue()); + } + + //! ConstantSDNode predicate for i16 sign-extended, 10-bit immediate values + bool + isI16IntS10Immediate(ConstantSDNode *CN) + { + return isInt<10>(CN->getSExtValue()); + } + + //! ConstantSDNode predicate for i16 unsigned 10-bit immediate values + bool + isI16IntU10Immediate(ConstantSDNode *CN) + { + return isUInt<10>((short) CN->getZExtValue()); + } + + //! ConstantSDNode predicate for signed 16-bit values + /*! + \arg CN The constant SelectionDAG node holding the value + \arg Imm The returned 16-bit value, if returning true + + This predicate tests the value in \a CN to see whether it can be + represented as a 16-bit, sign-extended quantity. Returns true if + this is the case. 
+ */ + bool + isIntS16Immediate(ConstantSDNode *CN, short &Imm) + { + EVT vt = CN->getValueType(0); + Imm = (short) CN->getZExtValue(); + if (vt.getSimpleVT() >= MVT::i1 && vt.getSimpleVT() <= MVT::i16) { + return true; + } else if (vt == MVT::i32) { + int32_t i_val = (int32_t) CN->getZExtValue(); + short s_val = (short) i_val; + return i_val == s_val; + } else { + int64_t i_val = (int64_t) CN->getZExtValue(); + short s_val = (short) i_val; + return i_val == s_val; + } + } + + //! ConstantFPSDNode predicate for representing floats as 16-bit sign ext. + static bool + isFPS16Immediate(ConstantFPSDNode *FPN, short &Imm) + { + EVT vt = FPN->getValueType(0); + if (vt == MVT::f32) { + int val = FloatToBits(FPN->getValueAPF().convertToFloat()); + int sval = (int) ((val << 16) >> 16); + Imm = (short) val; + return val == sval; + } + + return false; + } + + //! Generate the carry-generate shuffle mask. + SDValue getCarryGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) { + SmallVector<SDValue, 16 > ShufBytes; + + // Create the shuffle mask for "rotating" the borrow up one register slot + // once the borrow is generated. + ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32)); + ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32)); + ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32)); + ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32)); + + return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + &ShufBytes[0], ShufBytes.size()); + } + + //! Generate the borrow-generate shuffle mask + SDValue getBorrowGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) { + SmallVector<SDValue, 16 > ShufBytes; + + // Create the shuffle mask for "rotating" the borrow up one register slot + // once the borrow is generated. + ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32)); + ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32)); + ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32)); + ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32)); + + return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + &ShufBytes[0], ShufBytes.size()); + } + + //===------------------------------------------------------------------===// + /// SPUDAGToDAGISel - Cell SPU-specific code to select SPU machine + /// instructions for SelectionDAG operations. + /// + class SPUDAGToDAGISel : + public SelectionDAGISel + { + const SPUTargetMachine &TM; + const SPUTargetLowering &SPUtli; + unsigned GlobalBaseReg; + + public: + explicit SPUDAGToDAGISel(SPUTargetMachine &tm) : + SelectionDAGISel(tm), + TM(tm), + SPUtli(*tm.getTargetLowering()) + { } + + virtual bool runOnMachineFunction(MachineFunction &MF) { + // Make sure we re-emit a set of the global base reg if necessary + GlobalBaseReg = 0; + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + + /// getI32Imm - Return a target constant with the specified value, of type + /// i32. + inline SDValue getI32Imm(uint32_t Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + /// getSmallIPtrImm - Return a target constant of pointer type. 
+ inline SDValue getSmallIPtrImm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy()); + } + + SDNode *emitBuildVector(SDNode *bvNode) { + EVT vecVT = bvNode->getValueType(0); + DebugLoc dl = bvNode->getDebugLoc(); + + // Check to see if this vector can be represented as a CellSPU immediate + // constant by invoking all of the instruction selection predicates: + if (((vecVT == MVT::v8i16) && + (SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i16).getNode() != 0)) || + ((vecVT == MVT::v4i32) && + ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) || + (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) || + (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) || + (SPU::get_v4i32_imm(bvNode, *CurDAG).getNode() != 0))) || + ((vecVT == MVT::v2i64) && + ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) || + (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) || + (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i64).getNode() != 0)))) { + HandleSDNode Dummy(SDValue(bvNode, 0)); + if (SDNode *N = Select(bvNode)) + return N; + return Dummy.getValue().getNode(); + } + + // No, need to emit a constant pool spill: + std::vector<Constant*> CV; + + for (size_t i = 0; i < bvNode->getNumOperands(); ++i) { + ConstantSDNode *V = cast<ConstantSDNode > (bvNode->getOperand(i)); + CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue())); + } + + const Constant *CP = ConstantVector::get(CV); + SDValue CPIdx = CurDAG->getConstantPool(CP, SPUtli.getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); + SDValue CGPoolOffset = + SPU::LowerConstantPool(CPIdx, *CurDAG, TM); + + HandleSDNode Dummy(CurDAG->getLoad(vecVT, dl, + CurDAG->getEntryNode(), CGPoolOffset, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment)); + CurDAG->ReplaceAllUsesWith(SDValue(bvNode, 0), Dummy.getValue()); + if (SDNode *N = SelectCode(Dummy.getValue().getNode())) + return N; + return Dummy.getValue().getNode(); + } + + /// Select - Convert the specified operand from a target-independent to a + /// target-specific node if it hasn't already been changed. + SDNode *Select(SDNode *N); + + //! Emit the instruction sequence for i64 shl + SDNode *SelectSHLi64(SDNode *N, EVT OpVT); + + //! Emit the instruction sequence for i64 srl + SDNode *SelectSRLi64(SDNode *N, EVT OpVT); + + //! Emit the instruction sequence for i64 sra + SDNode *SelectSRAi64(SDNode *N, EVT OpVT); + + //! Emit the necessary sequence for loading i64 constants: + SDNode *SelectI64Constant(SDNode *N, EVT OpVT, DebugLoc dl); + + //! Alternate instruction emit sequence for loading i64 constants + SDNode *SelectI64Constant(uint64_t i64const, EVT OpVT, DebugLoc dl); + + //! Returns true if the address N is an A-form (local store) address + bool SelectAFormAddr(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Index); + + //! D-form address predicate + bool SelectDFormAddr(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Index); + + /// Alternate D-form address using i7 offset predicate + bool SelectDForm2Addr(SDNode *Op, SDValue N, SDValue &Disp, + SDValue &Base); + + /// D-form address selection workhorse + bool DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Disp, + SDValue &Base, int minOffset, int maxOffset); + + //! Address predicate if N can be expressed as an indexed [r+r] operation. 
+ bool SelectXFormAddr(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Index); + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps) { + SDValue Op0, Op1; + switch (ConstraintCode) { + default: return true; + case 'm': // memory + if (!SelectDFormAddr(Op.getNode(), Op, Op0, Op1) + && !SelectAFormAddr(Op.getNode(), Op, Op0, Op1)) + SelectXFormAddr(Op.getNode(), Op, Op0, Op1); + break; + case 'o': // offsetable + if (!SelectDFormAddr(Op.getNode(), Op, Op0, Op1) + && !SelectAFormAddr(Op.getNode(), Op, Op0, Op1)) { + Op0 = Op; + Op1 = getSmallIPtrImm(0); + } + break; + case 'v': // not offsetable +#if 1 + llvm_unreachable("InlineAsmMemoryOperand 'v' constraint not handled."); +#else + SelectAddrIdxOnly(Op, Op, Op0, Op1); + break; +#endif + } + + OutOps.push_back(Op0); + OutOps.push_back(Op1); + return false; + } + + virtual const char *getPassName() const { + return "Cell SPU DAG->DAG Pattern Instruction Selection"; + } + + private: + SDValue getRC( MVT ); + + // Include the pieces autogenerated from the target description. +#include "SPUGenDAGISel.inc" + }; +} + +/*! + \arg Op The ISD instruction operand + \arg N The address to be tested + \arg Base The base address + \arg Index The base address index + */ +bool +SPUDAGToDAGISel::SelectAFormAddr(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Index) { + // These match the addr256k operand type: + EVT OffsVT = MVT::i16; + SDValue Zero = CurDAG->getTargetConstant(0, OffsVT); + int64_t val; + + switch (N.getOpcode()) { + case ISD::Constant: + val = dyn_cast<ConstantSDNode>(N.getNode())->getSExtValue(); + Base = CurDAG->getTargetConstant( val , MVT::i32); + Index = Zero; + return true; + case ISD::ConstantPool: + case ISD::GlobalAddress: + report_fatal_error("SPU SelectAFormAddr: Pool/Global not lowered."); + /*NOTREACHED*/ + + case ISD::TargetConstant: + case ISD::TargetGlobalAddress: + case ISD::TargetJumpTable: + report_fatal_error("SPUSelectAFormAddr: Target Constant/Pool/Global " + "not wrapped as A-form address."); + /*NOTREACHED*/ + + case SPUISD::AFormAddr: + // Just load from memory if there's only a single use of the location, + // otherwise, this will get handled below with D-form offset addresses + if (N.hasOneUse()) { + SDValue Op0 = N.getOperand(0); + switch (Op0.getOpcode()) { + case ISD::TargetConstantPool: + case ISD::TargetJumpTable: + Base = Op0; + Index = Zero; + return true; + + case ISD::TargetGlobalAddress: { + GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op0); + const GlobalValue *GV = GSDN->getGlobal(); + if (GV->getAlignment() == 16) { + Base = Op0; + Index = Zero; + return true; + } + break; + } + } + } + break; + } + return false; +} + +bool +SPUDAGToDAGISel::SelectDForm2Addr(SDNode *Op, SDValue N, SDValue &Disp, + SDValue &Base) { + const int minDForm2Offset = -(1 << 7); + const int maxDForm2Offset = (1 << 7) - 1; + return DFormAddressPredicate(Op, N, Disp, Base, minDForm2Offset, + maxDForm2Offset); +} + +/*! + \arg Op The ISD instruction (ignored) + \arg N The address to be tested + \arg Base Base address register/pointer + \arg Index Base address index + + Examine the input address by a base register plus a signed 10-bit + displacement, [r+I10] (D-form address). + + \return true if \a N is a D-form address with \a Base and \a Index set + to non-empty SDValue instances. 
+*/ +bool +SPUDAGToDAGISel::SelectDFormAddr(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Index) { + return DFormAddressPredicate(Op, N, Base, Index, + SPUFrameLowering::minFrameOffset(), + SPUFrameLowering::maxFrameOffset()); +} + +bool +SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Index, int minOffset, + int maxOffset) { + unsigned Opc = N.getOpcode(); + EVT PtrTy = SPUtli.getPointerTy(); + + if (Opc == ISD::FrameIndex) { + // Stack frame index must be less than 512 (divided by 16): + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(N); + int FI = int(FIN->getIndex()); + DEBUG(errs() << "SelectDFormAddr: ISD::FrameIndex = " + << FI << "\n"); + if (SPUFrameLowering::FItoStackOffset(FI) < maxOffset) { + Base = CurDAG->getTargetConstant(0, PtrTy); + Index = CurDAG->getTargetFrameIndex(FI, PtrTy); + return true; + } + } else if (Opc == ISD::ADD) { + // Generated by getelementptr + const SDValue Op0 = N.getOperand(0); + const SDValue Op1 = N.getOperand(1); + + if ((Op0.getOpcode() == SPUISD::Hi && Op1.getOpcode() == SPUISD::Lo) + || (Op1.getOpcode() == SPUISD::Hi && Op0.getOpcode() == SPUISD::Lo)) { + Base = CurDAG->getTargetConstant(0, PtrTy); + Index = N; + return true; + } else if (Op1.getOpcode() == ISD::Constant + || Op1.getOpcode() == ISD::TargetConstant) { + ConstantSDNode *CN = cast<ConstantSDNode>(Op1); + int32_t offset = int32_t(CN->getSExtValue()); + + if (Op0.getOpcode() == ISD::FrameIndex) { + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op0); + int FI = int(FIN->getIndex()); + DEBUG(errs() << "SelectDFormAddr: ISD::ADD offset = " << offset + << " frame index = " << FI << "\n"); + + if (SPUFrameLowering::FItoStackOffset(FI) < maxOffset) { + Base = CurDAG->getTargetConstant(offset, PtrTy); + Index = CurDAG->getTargetFrameIndex(FI, PtrTy); + return true; + } + } else if (offset > minOffset && offset < maxOffset) { + Base = CurDAG->getTargetConstant(offset, PtrTy); + Index = Op0; + return true; + } + } else if (Op0.getOpcode() == ISD::Constant + || Op0.getOpcode() == ISD::TargetConstant) { + ConstantSDNode *CN = cast<ConstantSDNode>(Op0); + int32_t offset = int32_t(CN->getSExtValue()); + + if (Op1.getOpcode() == ISD::FrameIndex) { + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op1); + int FI = int(FIN->getIndex()); + DEBUG(errs() << "SelectDFormAddr: ISD::ADD offset = " << offset + << " frame index = " << FI << "\n"); + + if (SPUFrameLowering::FItoStackOffset(FI) < maxOffset) { + Base = CurDAG->getTargetConstant(offset, PtrTy); + Index = CurDAG->getTargetFrameIndex(FI, PtrTy); + return true; + } + } else if (offset > minOffset && offset < maxOffset) { + Base = CurDAG->getTargetConstant(offset, PtrTy); + Index = Op1; + return true; + } + } + } else if (Opc == SPUISD::IndirectAddr) { + // Indirect with constant offset -> D-Form address + const SDValue Op0 = N.getOperand(0); + const SDValue Op1 = N.getOperand(1); + + if (Op0.getOpcode() == SPUISD::Hi + && Op1.getOpcode() == SPUISD::Lo) { + // (SPUindirect (SPUhi <arg>, 0), (SPUlo <arg>, 0)) + Base = CurDAG->getTargetConstant(0, PtrTy); + Index = N; + return true; + } else if (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1)) { + int32_t offset = 0; + SDValue idxOp; + + if (isa<ConstantSDNode>(Op1)) { + ConstantSDNode *CN = cast<ConstantSDNode>(Op1); + offset = int32_t(CN->getSExtValue()); + idxOp = Op0; + } else if (isa<ConstantSDNode>(Op0)) { + ConstantSDNode *CN = cast<ConstantSDNode>(Op0); + offset = int32_t(CN->getSExtValue()); + idxOp = Op1; + } + + if (offset >= minOffset && 
offset <= maxOffset) { + Base = CurDAG->getTargetConstant(offset, PtrTy); + Index = idxOp; + return true; + } + } + } else if (Opc == SPUISD::AFormAddr) { + Base = CurDAG->getTargetConstant(0, N.getValueType()); + Index = N; + return true; + } else if (Opc == SPUISD::LDRESULT) { + Base = CurDAG->getTargetConstant(0, N.getValueType()); + Index = N; + return true; + } else if (Opc == ISD::Register + ||Opc == ISD::CopyFromReg + ||Opc == ISD::UNDEF + ||Opc == ISD::Constant) { + unsigned OpOpc = Op->getOpcode(); + + if (OpOpc == ISD::STORE || OpOpc == ISD::LOAD) { + // Direct load/store without getelementptr + SDValue Offs; + + Offs = ((OpOpc == ISD::STORE) ? Op->getOperand(3) : Op->getOperand(2)); + + if (Offs.getOpcode() == ISD::Constant || Offs.getOpcode() == ISD::UNDEF) { + if (Offs.getOpcode() == ISD::UNDEF) + Offs = CurDAG->getTargetConstant(0, Offs.getValueType()); + + Base = Offs; + Index = N; + return true; + } + } else { + /* If otherwise unadorned, default to D-form address with 0 offset: */ + if (Opc == ISD::CopyFromReg) { + Index = N.getOperand(1); + } else { + Index = N; + } + + Base = CurDAG->getTargetConstant(0, Index.getValueType()); + return true; + } + } + + return false; +} + +/*! + \arg Op The ISD instruction operand + \arg N The address operand + \arg Base The base pointer operand + \arg Index The offset/index operand + + If the address \a N can be expressed as an A-form or D-form address, returns + false. Otherwise, creates two operands, Base and Index that will become the + (r)(r) X-form address. +*/ +bool +SPUDAGToDAGISel::SelectXFormAddr(SDNode *Op, SDValue N, SDValue &Base, + SDValue &Index) { + if (!SelectAFormAddr(Op, N, Base, Index) + && !SelectDFormAddr(Op, N, Base, Index)) { + // If the address is neither A-form or D-form, punt and use an X-form + // address: + Base = N.getOperand(1); + Index = N.getOperand(0); + return true; + } + + return false; +} + +/*! + Utility function to use with COPY_TO_REGCLASS instructions. Returns a SDValue + to be used as the last parameter of a +CurDAG->getMachineNode(COPY_TO_REGCLASS,..., ) function call + \arg VT the value type for which we want a register class +*/ +SDValue SPUDAGToDAGISel::getRC( MVT VT ) { + switch( VT.SimpleTy ) { + case MVT::i8: + return CurDAG->getTargetConstant(SPU::R8CRegClass.getID(), MVT::i32); + case MVT::i16: + return CurDAG->getTargetConstant(SPU::R16CRegClass.getID(), MVT::i32); + case MVT::i32: + return CurDAG->getTargetConstant(SPU::R32CRegClass.getID(), MVT::i32); + case MVT::f32: + return CurDAG->getTargetConstant(SPU::R32FPRegClass.getID(), MVT::i32); + case MVT::i64: + return CurDAG->getTargetConstant(SPU::R64CRegClass.getID(), MVT::i32); + case MVT::i128: + return CurDAG->getTargetConstant(SPU::GPRCRegClass.getID(), MVT::i32); + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v4f32: + case MVT::v2i64: + case MVT::v2f64: + return CurDAG->getTargetConstant(SPU::VECREGRegClass.getID(), MVT::i32); + default: + assert( false && "add a new case here" ); + return SDValue(); + } +} + +//! Convert the operand from a target-independent to a target-specific node +/*! + */ +SDNode * +SPUDAGToDAGISel::Select(SDNode *N) { + unsigned Opc = N->getOpcode(); + int n_ops = -1; + unsigned NewOpc = 0; + EVT OpVT = N->getValueType(0); + SDValue Ops[8]; + DebugLoc dl = N->getDebugLoc(); + + if (N->isMachineOpcode()) + return NULL; // Already selected. 
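For reference while reading the address predicates above: the D-form window enforced by DFormAddressPredicate comes straight from the SPUFrameLowering.h helpers, and SelectDForm2Addr narrows it to a signed 7-bit byte range. The block below only restates arithmetic already present in those sources; the constant names are local to this illustration.

// Displacement windows used by DFormAddressPredicate and SelectDForm2Addr.
constexpr int StackSlotSize   = 16;                             // bytes per slot
constexpr int MaxFrameOffset  = ((1 << 9) - 1) * StackSlotSize; //  8176
constexpr int MinFrameOffset  = -(1 << 9) * StackSlotSize;      // -8192
constexpr int MaxDForm2Offset = (1 << 7) - 1;                   //    127
constexpr int MinDForm2Offset = -(1 << 7);                      //   -128
static_assert(MaxFrameOffset == 8176 && MinFrameOffset == -8192,
              "10-bit D-form range, scaled by the 16-byte slot size");
static_assert(MaxDForm2Offset == 127 && MinDForm2Offset == -128,
              "7-bit byte range used by the D-form2 variant");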
+ + if (Opc == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0)); + SDValue Imm0 = CurDAG->getTargetConstant(0, N->getValueType(0)); + + if (FI < 128) { + NewOpc = SPU::AIr32; + Ops[0] = TFI; + Ops[1] = Imm0; + n_ops = 2; + } else { + NewOpc = SPU::Ar32; + Ops[0] = CurDAG->getRegister(SPU::R1, N->getValueType(0)); + Ops[1] = SDValue(CurDAG->getMachineNode(SPU::ILAr32, dl, + N->getValueType(0), TFI), + 0); + n_ops = 2; + } + } else if (Opc == ISD::Constant && OpVT == MVT::i64) { + // Catch the i64 constants that end up here. Note: The backend doesn't + // attempt to legalize the constant (it's useless because DAGCombiner + // will insert 64-bit constants and we can't stop it). + return SelectI64Constant(N, OpVT, N->getDebugLoc()); + } else if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) + && OpVT == MVT::i64) { + SDValue Op0 = N->getOperand(0); + EVT Op0VT = Op0.getValueType(); + EVT Op0VecVT = EVT::getVectorVT(*CurDAG->getContext(), + Op0VT, (128 / Op0VT.getSizeInBits())); + EVT OpVecVT = EVT::getVectorVT(*CurDAG->getContext(), + OpVT, (128 / OpVT.getSizeInBits())); + SDValue shufMask; + + switch (Op0VT.getSimpleVT().SimpleTy) { + default: + report_fatal_error("CellSPU Select: Unhandled zero/any extend EVT"); + /*NOTREACHED*/ + case MVT::i32: + shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + CurDAG->getConstant(0x80808080, MVT::i32), + CurDAG->getConstant(0x00010203, MVT::i32), + CurDAG->getConstant(0x80808080, MVT::i32), + CurDAG->getConstant(0x08090a0b, MVT::i32)); + break; + + case MVT::i16: + shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + CurDAG->getConstant(0x80808080, MVT::i32), + CurDAG->getConstant(0x80800203, MVT::i32), + CurDAG->getConstant(0x80808080, MVT::i32), + CurDAG->getConstant(0x80800a0b, MVT::i32)); + break; + + case MVT::i8: + shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + CurDAG->getConstant(0x80808080, MVT::i32), + CurDAG->getConstant(0x80808003, MVT::i32), + CurDAG->getConstant(0x80808080, MVT::i32), + CurDAG->getConstant(0x8080800b, MVT::i32)); + break; + } + + SDNode *shufMaskLoad = emitBuildVector(shufMask.getNode()); + + HandleSDNode PromoteScalar(CurDAG->getNode(SPUISD::PREFSLOT2VEC, dl, + Op0VecVT, Op0)); + + SDValue PromScalar; + if (SDNode *N = SelectCode(PromoteScalar.getValue().getNode())) + PromScalar = SDValue(N, 0); + else + PromScalar = PromoteScalar.getValue(); + + SDValue zextShuffle = + CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT, + PromScalar, PromScalar, + SDValue(shufMaskLoad, 0)); + + HandleSDNode Dummy2(zextShuffle); + if (SDNode *N = SelectCode(Dummy2.getValue().getNode())) + zextShuffle = SDValue(N, 0); + else + zextShuffle = Dummy2.getValue(); + HandleSDNode Dummy(CurDAG->getNode(SPUISD::VEC2PREFSLOT, dl, OpVT, + zextShuffle)); + + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); + SelectCode(Dummy.getValue().getNode()); + return Dummy.getValue().getNode(); + } else if (Opc == ISD::ADD && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) { + SDNode *CGLoad = + emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl).getNode()); + + HandleSDNode Dummy(CurDAG->getNode(SPUISD::ADD64_MARKER, dl, OpVT, + N->getOperand(0), N->getOperand(1), + SDValue(CGLoad, 0))); + + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); + if (SDNode *N = SelectCode(Dummy.getValue().getNode())) + return N; + return Dummy.getValue().getNode(); + } else if (Opc == ISD::SUB && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) { + 
SDNode *CGLoad = + emitBuildVector(getBorrowGenerateShufMask(*CurDAG, dl).getNode()); + + HandleSDNode Dummy(CurDAG->getNode(SPUISD::SUB64_MARKER, dl, OpVT, + N->getOperand(0), N->getOperand(1), + SDValue(CGLoad, 0))); + + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); + if (SDNode *N = SelectCode(Dummy.getValue().getNode())) + return N; + return Dummy.getValue().getNode(); + } else if (Opc == ISD::MUL && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) { + SDNode *CGLoad = + emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl).getNode()); + + HandleSDNode Dummy(CurDAG->getNode(SPUISD::MUL64_MARKER, dl, OpVT, + N->getOperand(0), N->getOperand(1), + SDValue(CGLoad, 0))); + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); + if (SDNode *N = SelectCode(Dummy.getValue().getNode())) + return N; + return Dummy.getValue().getNode(); + } else if (Opc == ISD::TRUNCATE) { + SDValue Op0 = N->getOperand(0); + if ((Op0.getOpcode() == ISD::SRA || Op0.getOpcode() == ISD::SRL) + && OpVT == MVT::i32 + && Op0.getValueType() == MVT::i64) { + // Catch (truncate:i32 ([sra|srl]:i64 arg, c), where c >= 32 + // + // Take advantage of the fact that the upper 32 bits are in the + // i32 preferred slot and avoid shuffle gymnastics: + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); + if (CN != 0) { + unsigned shift_amt = unsigned(CN->getZExtValue()); + + if (shift_amt >= 32) { + SDNode *hi32 = + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + Op0.getOperand(0), getRC(MVT::i32)); + + shift_amt -= 32; + if (shift_amt > 0) { + // Take care of the additional shift, if present: + SDValue shift = CurDAG->getTargetConstant(shift_amt, MVT::i32); + unsigned Opc = SPU::ROTMAIr32_i32; + + if (Op0.getOpcode() == ISD::SRL) + Opc = SPU::ROTMr32; + + hi32 = CurDAG->getMachineNode(Opc, dl, OpVT, SDValue(hi32, 0), + shift); + } + + return hi32; + } + } + } + } else if (Opc == ISD::SHL) { + if (OpVT == MVT::i64) + return SelectSHLi64(N, OpVT); + } else if (Opc == ISD::SRL) { + if (OpVT == MVT::i64) + return SelectSRLi64(N, OpVT); + } else if (Opc == ISD::SRA) { + if (OpVT == MVT::i64) + return SelectSRAi64(N, OpVT); + } else if (Opc == ISD::FNEG + && (OpVT == MVT::f64 || OpVT == MVT::v2f64)) { + DebugLoc dl = N->getDebugLoc(); + // Check if the pattern is a special form of DFNMS: + // (fneg (fsub (fmul R64FP:$rA, R64FP:$rB), R64FP:$rC)) + SDValue Op0 = N->getOperand(0); + if (Op0.getOpcode() == ISD::FSUB) { + SDValue Op00 = Op0.getOperand(0); + if (Op00.getOpcode() == ISD::FMUL) { + unsigned Opc = SPU::DFNMSf64; + if (OpVT == MVT::v2f64) + Opc = SPU::DFNMSv2f64; + + return CurDAG->getMachineNode(Opc, dl, OpVT, + Op00.getOperand(0), + Op00.getOperand(1), + Op0.getOperand(1)); + } + } + + SDValue negConst = CurDAG->getConstant(0x8000000000000000ULL, MVT::i64); + SDNode *signMask = 0; + unsigned Opc = SPU::XORfneg64; + + if (OpVT == MVT::f64) { + signMask = SelectI64Constant(negConst.getNode(), MVT::i64, dl); + } else if (OpVT == MVT::v2f64) { + Opc = SPU::XORfnegvec; + signMask = emitBuildVector(CurDAG->getNode(ISD::BUILD_VECTOR, dl, + MVT::v2i64, + negConst, negConst).getNode()); + } + + return CurDAG->getMachineNode(Opc, dl, OpVT, + N->getOperand(0), SDValue(signMask, 0)); + } else if (Opc == ISD::FABS) { + if (OpVT == MVT::f64) { + SDNode *signMask = SelectI64Constant(0x7fffffffffffffffULL, MVT::i64, dl); + return CurDAG->getMachineNode(SPU::ANDfabs64, dl, OpVT, + N->getOperand(0), SDValue(signMask, 0)); + } else if (OpVT == MVT::v2f64) { + SDValue absConst = 
CurDAG->getConstant(0x7fffffffffffffffULL, MVT::i64); + SDValue absVec = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, + absConst, absConst); + SDNode *signMask = emitBuildVector(absVec.getNode()); + return CurDAG->getMachineNode(SPU::ANDfabsvec, dl, OpVT, + N->getOperand(0), SDValue(signMask, 0)); + } + } else if (Opc == SPUISD::LDRESULT) { + // Custom select instructions for LDRESULT + EVT VT = N->getValueType(0); + SDValue Arg = N->getOperand(0); + SDValue Chain = N->getOperand(1); + SDNode *Result; + + Result = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VT, + MVT::Other, Arg, + getRC( VT.getSimpleVT()), Chain); + return Result; + + } else if (Opc == SPUISD::IndirectAddr) { + // Look at the operands: SelectCode() will catch the cases that aren't + // specifically handled here. + // + // SPUInstrInfo catches the following patterns: + // (SPUindirect (SPUhi ...), (SPUlo ...)) + // (SPUindirect $sp, imm) + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + RegisterSDNode *RN; + + if ((Op0.getOpcode() != SPUISD::Hi && Op1.getOpcode() != SPUISD::Lo) + || (Op0.getOpcode() == ISD::Register + && ((RN = dyn_cast<RegisterSDNode>(Op0.getNode())) != 0 + && RN->getReg() != SPU::R1))) { + NewOpc = SPU::Ar32; + Ops[1] = Op1; + if (Op1.getOpcode() == ISD::Constant) { + ConstantSDNode *CN = cast<ConstantSDNode>(Op1); + Op1 = CurDAG->getTargetConstant(CN->getSExtValue(), VT); + if (isInt<10>(CN->getSExtValue())) { + NewOpc = SPU::AIr32; + Ops[1] = Op1; + } else { + Ops[1] = SDValue(CurDAG->getMachineNode(SPU::ILr32, dl, + N->getValueType(0), + Op1), + 0); + } + } + Ops[0] = Op0; + n_ops = 2; + } + } + + if (n_ops > 0) { + if (N->hasOneUse()) + return CurDAG->SelectNodeTo(N, NewOpc, OpVT, Ops, n_ops); + else + return CurDAG->getMachineNode(NewOpc, dl, OpVT, Ops, n_ops); + } else + return SelectCode(N); +} + +/*! + * Emit the instruction sequence for i64 left shifts. The basic algorithm + * is to fill the bottom two word slots with zeros so that zeros are shifted + * in as the entire quadword is shifted left. + * + * \note This code could also be used to implement v2i64 shl. + * + * @param Op The shl operand + * @param OpVT Op's machine value value type (doesn't need to be passed, but + * makes life easier.) 
+ * @return The SDNode with the entire instruction sequence + */ +SDNode * +SPUDAGToDAGISel::SelectSHLi64(SDNode *N, EVT OpVT) { + SDValue Op0 = N->getOperand(0); + EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), + OpVT, (128 / OpVT.getSizeInBits())); + SDValue ShiftAmt = N->getOperand(1); + EVT ShiftAmtVT = ShiftAmt.getValueType(); + SDNode *VecOp0, *SelMask, *ZeroFill, *Shift = 0; + SDValue SelMaskVal; + DebugLoc dl = N->getDebugLoc(); + + VecOp0 = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VecVT, + Op0, getRC(MVT::v2i64) ); + SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16); + SelMask = CurDAG->getMachineNode(SPU::FSMBIv2i64, dl, VecVT, SelMaskVal); + ZeroFill = CurDAG->getMachineNode(SPU::ILv2i64, dl, VecVT, + CurDAG->getTargetConstant(0, OpVT)); + VecOp0 = CurDAG->getMachineNode(SPU::SELBv2i64, dl, VecVT, + SDValue(ZeroFill, 0), + SDValue(VecOp0, 0), + SDValue(SelMask, 0)); + + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) { + unsigned bytes = unsigned(CN->getZExtValue()) >> 3; + unsigned bits = unsigned(CN->getZExtValue()) & 7; + + if (bytes > 0) { + Shift = + CurDAG->getMachineNode(SPU::SHLQBYIv2i64, dl, VecVT, + SDValue(VecOp0, 0), + CurDAG->getTargetConstant(bytes, ShiftAmtVT)); + } + + if (bits > 0) { + Shift = + CurDAG->getMachineNode(SPU::SHLQBIIv2i64, dl, VecVT, + SDValue((Shift != 0 ? Shift : VecOp0), 0), + CurDAG->getTargetConstant(bits, ShiftAmtVT)); + } + } else { + SDNode *Bytes = + CurDAG->getMachineNode(SPU::ROTMIr32, dl, ShiftAmtVT, + ShiftAmt, + CurDAG->getTargetConstant(3, ShiftAmtVT)); + SDNode *Bits = + CurDAG->getMachineNode(SPU::ANDIr32, dl, ShiftAmtVT, + ShiftAmt, + CurDAG->getTargetConstant(7, ShiftAmtVT)); + Shift = + CurDAG->getMachineNode(SPU::SHLQBYv2i64, dl, VecVT, + SDValue(VecOp0, 0), SDValue(Bytes, 0)); + Shift = + CurDAG->getMachineNode(SPU::SHLQBIv2i64, dl, VecVT, + SDValue(Shift, 0), SDValue(Bits, 0)); + } + + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); +} + +/*! + * Emit the instruction sequence for i64 logical right shifts. + * + * @param Op The shl operand + * @param OpVT Op's machine value value type (doesn't need to be passed, but + * makes life easier.) + * @return The SDNode with the entire instruction sequence + */ +SDNode * +SPUDAGToDAGISel::SelectSRLi64(SDNode *N, EVT OpVT) { + SDValue Op0 = N->getOperand(0); + EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), + OpVT, (128 / OpVT.getSizeInBits())); + SDValue ShiftAmt = N->getOperand(1); + EVT ShiftAmtVT = ShiftAmt.getValueType(); + SDNode *VecOp0, *Shift = 0; + DebugLoc dl = N->getDebugLoc(); + + VecOp0 = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VecVT, + Op0, getRC(MVT::v2i64) ); + + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) { + unsigned bytes = unsigned(CN->getZExtValue()) >> 3; + unsigned bits = unsigned(CN->getZExtValue()) & 7; + + if (bytes > 0) { + Shift = + CurDAG->getMachineNode(SPU::ROTQMBYIv2i64, dl, VecVT, + SDValue(VecOp0, 0), + CurDAG->getTargetConstant(bytes, ShiftAmtVT)); + } + + if (bits > 0) { + Shift = + CurDAG->getMachineNode(SPU::ROTQMBIIv2i64, dl, VecVT, + SDValue((Shift != 0 ? 
Shift : VecOp0), 0), + CurDAG->getTargetConstant(bits, ShiftAmtVT)); + } + } else { + SDNode *Bytes = + CurDAG->getMachineNode(SPU::ROTMIr32, dl, ShiftAmtVT, + ShiftAmt, + CurDAG->getTargetConstant(3, ShiftAmtVT)); + SDNode *Bits = + CurDAG->getMachineNode(SPU::ANDIr32, dl, ShiftAmtVT, + ShiftAmt, + CurDAG->getTargetConstant(7, ShiftAmtVT)); + + // Ensure that the shift amounts are negated! + Bytes = CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT, + SDValue(Bytes, 0), + CurDAG->getTargetConstant(0, ShiftAmtVT)); + + Bits = CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT, + SDValue(Bits, 0), + CurDAG->getTargetConstant(0, ShiftAmtVT)); + + Shift = + CurDAG->getMachineNode(SPU::ROTQMBYv2i64, dl, VecVT, + SDValue(VecOp0, 0), SDValue(Bytes, 0)); + Shift = + CurDAG->getMachineNode(SPU::ROTQMBIv2i64, dl, VecVT, + SDValue(Shift, 0), SDValue(Bits, 0)); + } + + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); +} + +/*! + * Emit the instruction sequence for i64 arithmetic right shifts. + * + * @param Op The shl operand + * @param OpVT Op's machine value value type (doesn't need to be passed, but + * makes life easier.) + * @return The SDNode with the entire instruction sequence + */ +SDNode * +SPUDAGToDAGISel::SelectSRAi64(SDNode *N, EVT OpVT) { + // Promote Op0 to vector + EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), + OpVT, (128 / OpVT.getSizeInBits())); + SDValue ShiftAmt = N->getOperand(1); + EVT ShiftAmtVT = ShiftAmt.getValueType(); + DebugLoc dl = N->getDebugLoc(); + + SDNode *VecOp0 = + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + VecVT, N->getOperand(0), getRC(MVT::v2i64)); + + SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT); + SDNode *SignRot = + CurDAG->getMachineNode(SPU::ROTMAIv2i64_i32, dl, MVT::v2i64, + SDValue(VecOp0, 0), SignRotAmt); + SDNode *UpperHalfSign = + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + MVT::i32, SDValue(SignRot, 0), getRC(MVT::i32)); + + SDNode *UpperHalfSignMask = + CurDAG->getMachineNode(SPU::FSM64r32, dl, VecVT, SDValue(UpperHalfSign, 0)); + SDNode *UpperLowerMask = + CurDAG->getMachineNode(SPU::FSMBIv2i64, dl, VecVT, + CurDAG->getTargetConstant(0xff00ULL, MVT::i16)); + SDNode *UpperLowerSelect = + CurDAG->getMachineNode(SPU::SELBv2i64, dl, VecVT, + SDValue(UpperHalfSignMask, 0), + SDValue(VecOp0, 0), + SDValue(UpperLowerMask, 0)); + + SDNode *Shift = 0; + + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) { + unsigned bytes = unsigned(CN->getZExtValue()) >> 3; + unsigned bits = unsigned(CN->getZExtValue()) & 7; + + if (bytes > 0) { + bytes = 31 - bytes; + Shift = + CurDAG->getMachineNode(SPU::ROTQBYIv2i64, dl, VecVT, + SDValue(UpperLowerSelect, 0), + CurDAG->getTargetConstant(bytes, ShiftAmtVT)); + } + + if (bits > 0) { + bits = 8 - bits; + Shift = + CurDAG->getMachineNode(SPU::ROTQBIIv2i64, dl, VecVT, + SDValue((Shift != 0 ? Shift : UpperLowerSelect), 0), + CurDAG->getTargetConstant(bits, ShiftAmtVT)); + } + } else { + SDNode *NegShift = + CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT, + ShiftAmt, CurDAG->getTargetConstant(0, ShiftAmtVT)); + + Shift = + CurDAG->getMachineNode(SPU::ROTQBYBIv2i64_r32, dl, VecVT, + SDValue(UpperLowerSelect, 0), SDValue(NegShift, 0)); + Shift = + CurDAG->getMachineNode(SPU::ROTQBIv2i64, dl, VecVT, + SDValue(Shift, 0), SDValue(NegShift, 0)); + } + + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); +} + +/*! 
+ Do the necessary magic necessary to load a i64 constant + */ +SDNode *SPUDAGToDAGISel::SelectI64Constant(SDNode *N, EVT OpVT, + DebugLoc dl) { + ConstantSDNode *CN = cast<ConstantSDNode>(N); + return SelectI64Constant(CN->getZExtValue(), OpVT, dl); +} + +SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT, + DebugLoc dl) { + EVT OpVecVT = EVT::getVectorVT(*CurDAG->getContext(), OpVT, 2); + SDValue i64vec = + SPU::LowerV2I64Splat(OpVecVT, *CurDAG, Value64, dl); + + // Here's where it gets interesting, because we have to parse out the + // subtree handed back in i64vec: + + if (i64vec.getOpcode() == ISD::BITCAST) { + // The degenerate case where the upper and lower bits in the splat are + // identical: + SDValue Op0 = i64vec.getOperand(0); + + ReplaceUses(i64vec, Op0); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + SDValue(emitBuildVector(Op0.getNode()), 0), + getRC(MVT::i64)); + } else if (i64vec.getOpcode() == SPUISD::SHUFB) { + SDValue lhs = i64vec.getOperand(0); + SDValue rhs = i64vec.getOperand(1); + SDValue shufmask = i64vec.getOperand(2); + + if (lhs.getOpcode() == ISD::BITCAST) { + ReplaceUses(lhs, lhs.getOperand(0)); + lhs = lhs.getOperand(0); + } + + SDNode *lhsNode = (lhs.getNode()->isMachineOpcode() + ? lhs.getNode() + : emitBuildVector(lhs.getNode())); + + if (rhs.getOpcode() == ISD::BITCAST) { + ReplaceUses(rhs, rhs.getOperand(0)); + rhs = rhs.getOperand(0); + } + + SDNode *rhsNode = (rhs.getNode()->isMachineOpcode() + ? rhs.getNode() + : emitBuildVector(rhs.getNode())); + + if (shufmask.getOpcode() == ISD::BITCAST) { + ReplaceUses(shufmask, shufmask.getOperand(0)); + shufmask = shufmask.getOperand(0); + } + + SDNode *shufMaskNode = (shufmask.getNode()->isMachineOpcode() + ? shufmask.getNode() + : emitBuildVector(shufmask.getNode())); + + SDValue shufNode = + CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT, + SDValue(lhsNode, 0), SDValue(rhsNode, 0), + SDValue(shufMaskNode, 0)); + HandleSDNode Dummy(shufNode); + SDNode *SN = SelectCode(Dummy.getValue().getNode()); + if (SN == 0) SN = Dummy.getValue().getNode(); + + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(SN, 0), getRC(MVT::i64)); + } else if (i64vec.getOpcode() == ISD::BUILD_VECTOR) { + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + SDValue(emitBuildVector(i64vec.getNode()), 0), + getRC(MVT::i64)); + } else { + report_fatal_error("SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec" + "condition"); + } +} + +/// createSPUISelDag - This pass converts a legalized DAG into a +/// SPU-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createSPUISelDag(SPUTargetMachine &TM) { + return new SPUDAGToDAGISel(TM); +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPUISelLowering.cpp b/contrib/llvm/lib/Target/CellSPU/SPUISelLowering.cpp new file mode 100644 index 000000000000..062374127e2f --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUISelLowering.cpp @@ -0,0 +1,3259 @@ +//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SPUTargetLowering class. 
+// +//===----------------------------------------------------------------------===// + +#include "SPUISelLowering.h" +#include "SPUTargetMachine.h" +#include "SPUFrameLowering.h" +#include "SPUMachineFunction.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/CallingConv.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + // Byte offset of the preferred slot (counted from the MSB) + int prefslotOffset(EVT VT) { + int retval=0; + if (VT==MVT::i1) retval=3; + if (VT==MVT::i8) retval=3; + if (VT==MVT::i16) retval=2; + + return retval; + } + + //! Expand a library call into an actual call DAG node + /*! + \note + This code is taken from SelectionDAGLegalize, since it is not exposed as + part of the LLVM SelectionDAG API. + */ + + SDValue + ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG, + bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) { + // The input chain to this libcall is the entry node of the function. + // Legalizing the call will automatically add the previous call to the + // dependence. + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + EVT ArgVT = Op.getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Node = Op.getOperand(i); + Entry.Ty = ArgTy; + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy()); + + // Splice the libcall in wherever FindInputOutputChains tells us to. + Type *RetTy = + Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext()); + std::pair<SDValue, SDValue> CallInfo = + TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + 0, TLI.getLibcallCallingConv(LC), + /*isTailCall=*/false, + /*doesNotRet=*/false, /*isReturnValueUsed=*/true, + Callee, Args, DAG, Op.getDebugLoc()); + + return CallInfo.first; + } +} + +SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) + : TargetLowering(TM, new TargetLoweringObjectFileELF()), + SPUTM(TM) { + + // Use _setjmp/_longjmp instead of setjmp/longjmp. 
+ setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(true); + + // Set RTLIB libcall names as used by SPU: + setLibcallName(RTLIB::DIV_F64, "__fast_divdf3"); + + // Set up the SPU's register classes: + addRegisterClass(MVT::i8, SPU::R8CRegisterClass); + addRegisterClass(MVT::i16, SPU::R16CRegisterClass); + addRegisterClass(MVT::i32, SPU::R32CRegisterClass); + addRegisterClass(MVT::i64, SPU::R64CRegisterClass); + addRegisterClass(MVT::f32, SPU::R32FPRegisterClass); + addRegisterClass(MVT::f64, SPU::R64FPRegisterClass); + addRegisterClass(MVT::i128, SPU::GPRCRegisterClass); + + // SPU has no sign or zero extended loads for i1, i8, i16: + setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); + + setTruncStoreAction(MVT::i128, MVT::i64, Expand); + setTruncStoreAction(MVT::i128, MVT::i32, Expand); + setTruncStoreAction(MVT::i128, MVT::i16, Expand); + setTruncStoreAction(MVT::i128, MVT::i8, Expand); + + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // SPU constant load actions are custom lowered: + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Custom); + + // SPU's loads and stores have to be custom lowered: + for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128; + ++sctype) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype; + + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, Custom); + + for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) { + MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype; + setTruncStoreAction(VT, StoreVT, Expand); + } + } + + for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64; + ++sctype) { + MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype; + + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + + for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) { + MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype; + setTruncStoreAction(VT, StoreVT, Expand); + } + } + + // Expand the jumptable branches + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::Other, Expand); + + // Custom lower SELECT_CC for most cases, but expand by default + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i8, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i16, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); + + // SPU has no intrinsics for these particular operations: + setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); + + // SPU has no division/remainder instructions + setOperationAction(ISD::SREM, MVT::i8, Expand); + setOperationAction(ISD::UREM, MVT::i8, Expand); + setOperationAction(ISD::SDIV, MVT::i8, Expand); + setOperationAction(ISD::UDIV, MVT::i8, Expand); + setOperationAction(ISD::SDIVREM, MVT::i8, Expand); + setOperationAction(ISD::UDIVREM, MVT::i8, Expand); + setOperationAction(ISD::SREM, MVT::i16, Expand); + setOperationAction(ISD::UREM, MVT::i16, Expand); + 
setOperationAction(ISD::SDIV, MVT::i16, Expand); + setOperationAction(ISD::UDIV, MVT::i16, Expand); + setOperationAction(ISD::SDIVREM, MVT::i16, Expand); + setOperationAction(ISD::UDIVREM, MVT::i16, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + setOperationAction(ISD::SDIV, MVT::i64, Expand); + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SREM, MVT::i128, Expand); + setOperationAction(ISD::UREM, MVT::i128, Expand); + setOperationAction(ISD::SDIV, MVT::i128, Expand); + setOperationAction(ISD::UDIV, MVT::i128, Expand); + setOperationAction(ISD::SDIVREM, MVT::i128, Expand); + setOperationAction(ISD::UDIVREM, MVT::i128, Expand); + + // We don't support sin/cos/sqrt/fmod + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FREM , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FREM , MVT::f32, Expand); + + // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt + // for f32!) + setOperationAction(ISD::FSQRT, MVT::f64, Expand); + setOperationAction(ISD::FSQRT, MVT::f32, Expand); + + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + + // SPU can do rotate right and left, so legalize it... but customize for i8 + // because instructions don't exist. + + // FIXME: Change from "expand" to appropriate type once ROTR is supported in + // .td files. 
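+  // (Expanding ROTR here is still workable: the generic legalizer can rewrite
+  // a rotate-right either in terms of the legal rotate-left below or as a
+  // pair of shifts, so only ROTL needs native patterns for now.)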
+ setOperationAction(ISD::ROTR, MVT::i32, Expand /*Legal*/); + setOperationAction(ISD::ROTR, MVT::i16, Expand /*Legal*/); + setOperationAction(ISD::ROTR, MVT::i8, Expand /*Custom*/); + + setOperationAction(ISD::ROTL, MVT::i32, Legal); + setOperationAction(ISD::ROTL, MVT::i16, Legal); + setOperationAction(ISD::ROTL, MVT::i8, Custom); + + // SPU has no native version of shift left/right for i8 + setOperationAction(ISD::SHL, MVT::i8, Custom); + setOperationAction(ISD::SRL, MVT::i8, Custom); + setOperationAction(ISD::SRA, MVT::i8, Custom); + + // Make these operations legal and handle them during instruction selection: + setOperationAction(ISD::SHL, MVT::i64, Legal); + setOperationAction(ISD::SRL, MVT::i64, Legal); + setOperationAction(ISD::SRA, MVT::i64, Legal); + + // Custom lower i8, i32 and i64 multiplications + setOperationAction(ISD::MUL, MVT::i8, Custom); + setOperationAction(ISD::MUL, MVT::i32, Legal); + setOperationAction(ISD::MUL, MVT::i64, Legal); + + // Expand double-width multiplication + // FIXME: It would probably be reasonable to support some of these operations + setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand); + setOperationAction(ISD::MULHU, MVT::i8, Expand); + setOperationAction(ISD::MULHS, MVT::i8, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); + setOperationAction(ISD::MULHU, MVT::i16, Expand); + setOperationAction(ISD::MULHS, MVT::i16, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + + // Need to custom handle (some) common i8, i64 math ops + setOperationAction(ISD::ADD, MVT::i8, Custom); + setOperationAction(ISD::ADD, MVT::i64, Legal); + setOperationAction(ISD::SUB, MVT::i8, Custom); + setOperationAction(ISD::SUB, MVT::i64, Legal); + + // SPU does not have BSWAP. It does have i32 support CTLZ. + // CTPOP has to be custom lowered. 
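+  // (The custom CTPOP lowering presumably relies on the SPU "cntb" count-ones
+  // -in-bytes instruction, exposed below as SPUISD::CNTB, followed by a
+  // horizontal sum of the per-byte counts.)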
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i64, Expand); + + setOperationAction(ISD::CTPOP, MVT::i8, Custom); + setOperationAction(ISD::CTPOP, MVT::i16, Custom); + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i64, Custom); + setOperationAction(ISD::CTPOP, MVT::i128, Expand); + + setOperationAction(ISD::CTTZ , MVT::i8, Expand); + setOperationAction(ISD::CTTZ , MVT::i16, Expand); + setOperationAction(ISD::CTTZ , MVT::i32, Expand); + setOperationAction(ISD::CTTZ , MVT::i64, Expand); + setOperationAction(ISD::CTTZ , MVT::i128, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i128, Expand); + + setOperationAction(ISD::CTLZ , MVT::i8, Promote); + setOperationAction(ISD::CTLZ , MVT::i16, Promote); + setOperationAction(ISD::CTLZ , MVT::i32, Legal); + setOperationAction(ISD::CTLZ , MVT::i64, Expand); + setOperationAction(ISD::CTLZ , MVT::i128, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i128, Expand); + + // SPU has a version of select that implements (a&~c)|(b&c), just like + // select ought to work: + setOperationAction(ISD::SELECT, MVT::i8, Legal); + setOperationAction(ISD::SELECT, MVT::i16, Legal); + setOperationAction(ISD::SELECT, MVT::i32, Legal); + setOperationAction(ISD::SELECT, MVT::i64, Legal); + + setOperationAction(ISD::SETCC, MVT::i8, Legal); + setOperationAction(ISD::SETCC, MVT::i16, Legal); + setOperationAction(ISD::SETCC, MVT::i32, Legal); + setOperationAction(ISD::SETCC, MVT::i64, Legal); + setOperationAction(ISD::SETCC, MVT::f64, Custom); + + // Custom lower i128 -> i64 truncates + setOperationAction(ISD::TRUNCATE, MVT::i64, Custom); + + // Custom lower i32/i64 -> i128 sign extend + setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + // SPU has a legal FP -> signed INT instruction for f32, but for f64, need + // to expand to a libcall, hence the custom lowering: + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand); + + // FDIV on SPU requires custom lowering + setOperationAction(ISD::FDIV, MVT::f64, Expand); // to libcall + + // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64: + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + 
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + + setOperationAction(ISD::BITCAST, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::f32, Legal); + setOperationAction(ISD::BITCAST, MVT::i64, Legal); + setOperationAction(ISD::BITCAST, MVT::f64, Legal); + + // We cannot sextinreg(i1). Expand to shifts. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + // We want to legalize GlobalAddress and ConstantPool nodes into the + // appropriate instructions to materialize the address. + for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128; + ++sctype) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype; + + setOperationAction(ISD::GlobalAddress, VT, Custom); + setOperationAction(ISD::ConstantPool, VT, Custom); + setOperationAction(ISD::JumpTable, VT, Custom); + } + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex + setOperationAction(ISD::VASTART , MVT::Other, Custom); + + // Use the default implementation. + setOperationAction(ISD::VAARG , MVT::Other, Expand); + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + setOperationAction(ISD::VAEND , MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Expand); + + // Cell SPU has instructions for converting between i64 and fp. + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + + // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); + + // BUILD_PAIR can't be handled natively, and should be expanded to shl/or + setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); + + // First set operation action for all vector types to expand. Then we + // will selectively turn on ones that can be effectively codegen'd. + addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass); + + for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)i; + + // Set operation actions to legal types only. + if (!isTypeLegal(VT)) continue; + + // add/sub are legal for all supported vector VT's. + setOperationAction(ISD::ADD, VT, Legal); + setOperationAction(ISD::SUB, VT, Legal); + // mul has to be custom lowered. 
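+    // (Note: the line below actually leaves vector MUL Legal; the custom
+    // multiply handling set up earlier applies to the scalar i8 case.)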
+ setOperationAction(ISD::MUL, VT, Legal); + + setOperationAction(ISD::AND, VT, Legal); + setOperationAction(ISD::OR, VT, Legal); + setOperationAction(ISD::XOR, VT, Legal); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::SELECT, VT, Legal); + setOperationAction(ISD::STORE, VT, Custom); + + // These operations need to be expanded: + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + + // Expand all trunc stores + for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) { + MVT::SimpleValueType TargetVT = (MVT::SimpleValueType)j; + setTruncStoreAction(VT, TargetVT, Expand); + } + + // Custom lower build_vector, constant pool spills, insert and + // extract vector elements: + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::ConstantPool, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + + setOperationAction(ISD::SHL, MVT::v2i64, Expand); + + setOperationAction(ISD::AND, MVT::v16i8, Custom); + setOperationAction(ISD::OR, MVT::v16i8, Custom); + setOperationAction(ISD::XOR, MVT::v16i8, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); + + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct? + + setStackPointerRegisterToSaveRestore(SPU::R1); + + // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); + + setMinFunctionAlignment(3); + + computeRegisterProperties(); + + // Set pre-RA register scheduler default to BURR, which produces slightly + // better code than the default (could also be TDRR, but TargetLowering.h + // needs a mod to support that model): + setSchedulingPreference(Sched::RegPressure); +} + +const char *SPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + case SPUISD::RET_FLAG: return "SPUISD::RET_FLAG"; + case SPUISD::Hi: return "SPUISD::Hi"; + case SPUISD::Lo: return "SPUISD::Lo"; + case SPUISD::PCRelAddr: return "SPUISD::PCRelAddr"; + case SPUISD::AFormAddr: return "SPUISD::AFormAddr"; + case SPUISD::IndirectAddr: return "SPUISD::IndirectAddr"; + case SPUISD::LDRESULT: return "SPUISD::LDRESULT"; + case SPUISD::CALL: return "SPUISD::CALL"; + case SPUISD::SHUFB: return "SPUISD::SHUFB"; + case SPUISD::SHUFFLE_MASK: return "SPUISD::SHUFFLE_MASK"; + case SPUISD::CNTB: return "SPUISD::CNTB"; + case SPUISD::PREFSLOT2VEC: return "SPUISD::PREFSLOT2VEC"; + case SPUISD::VEC2PREFSLOT: return "SPUISD::VEC2PREFSLOT"; + case SPUISD::SHL_BITS: return "SPUISD::SHL_BITS"; + case SPUISD::SHL_BYTES: return "SPUISD::SHL_BYTES"; + case SPUISD::VEC_ROTL: return "SPUISD::VEC_ROTL"; + case SPUISD::VEC_ROTR: return "SPUISD::VEC_ROTR"; + case SPUISD::ROTBYTES_LEFT: return "SPUISD::ROTBYTES_LEFT"; + case SPUISD::ROTBYTES_LEFT_BITS: return "SPUISD::ROTBYTES_LEFT_BITS"; + case SPUISD::SELECT_MASK: return "SPUISD::SELECT_MASK"; + case SPUISD::SELB: return "SPUISD::SELB"; + case SPUISD::ADD64_MARKER: return 
"SPUISD::ADD64_MARKER"; + case SPUISD::SUB64_MARKER: return "SPUISD::SUB64_MARKER"; + case SPUISD::MUL64_MARKER: return "SPUISD::MUL64_MARKER"; + } +} + +//===----------------------------------------------------------------------===// +// Return the Cell SPU's SETCC result type +//===----------------------------------------------------------------------===// + +EVT SPUTargetLowering::getSetCCResultType(EVT VT) const { + // i8, i16 and i32 are valid SETCC result types + MVT::SimpleValueType retval; + + switch(VT.getSimpleVT().SimpleTy){ + case MVT::i1: + case MVT::i8: + retval = MVT::i8; break; + case MVT::i16: + retval = MVT::i16; break; + case MVT::i32: + default: + retval = MVT::i32; + } + return retval; +} + +//===----------------------------------------------------------------------===// +// Calling convention code: +//===----------------------------------------------------------------------===// + +#include "SPUGenCallingConv.inc" + +//===----------------------------------------------------------------------===// +// LowerOperation implementation +//===----------------------------------------------------------------------===// + +/// Custom lower loads for CellSPU +/*! + All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements + within a 16-byte block, we have to rotate to extract the requested element. + + For extending loads, we also want to ensure that the following sequence is + emitted, e.g. for MVT::f32 extending load to MVT::f64: + +\verbatim +%1 v16i8,ch = load +%2 v16i8,ch = rotate %1 +%3 v4f8, ch = bitconvert %2 +%4 f32 = vec2perfslot %3 +%5 f64 = fp_extend %4 +\endverbatim +*/ +static SDValue +LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { + LoadSDNode *LN = cast<LoadSDNode>(Op); + SDValue the_chain = LN->getChain(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT InVT = LN->getMemoryVT(); + EVT OutVT = Op.getValueType(); + ISD::LoadExtType ExtType = LN->getExtensionType(); + unsigned alignment = LN->getAlignment(); + int pso = prefslotOffset(InVT); + DebugLoc dl = Op.getDebugLoc(); + EVT vecVT = InVT.isVector()? 
InVT: EVT::getVectorVT(*DAG.getContext(), InVT, + (128 / InVT.getSizeInBits())); + + // two sanity checks + assert( LN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (InVT.getSizeInBits() == 128 && (alignment%16) == 0) + return SDValue(); + + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = LN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16); + + SDValue result; + SDValue basePtr = LN->getBasePtr(); + SDValue rotate; + + if ((alignment%16) == 0) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and the rotation amount: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + int64_t rotamt = int64_t((offset & 0xf) - pso); + + if (rotamt < 0) + rotamt += 16; + + rotate = DAG.getConstant(rotamt, MVT::i16); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + if ((offset & ~0xf) > 0) { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & ~0xf), PtrVT)); + } + } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) + || (basePtr.getOpcode() == SPUISD::IndirectAddr + && basePtr.getOperand(0).getOpcode() == SPUISD::Hi + && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { + // Plain aligned a-form address: rotate into preferred slot + // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getConstant(rotamt, MVT::i16); + } else { + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(rotamt, PtrVT)); + } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa<ConstantSDNode>(Op1)) { + // Convert the (add <ptr>, <const>) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add <arg1>, <arg2>) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. 
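+        // (For reference: on SPU, a-form is an absolute address, d-form is a
+        // small displacement off a register, and x-form is register+register;
+        // SPUISD::IndirectAddr is the node that instruction selection later
+        // turns into d-form or x-form addresses.)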
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + rotate = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(-pso, PtrVT)); + } + + // Do the load as a i128 to allow possible shifting + SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr, + lowMemPtr, + LN->isVolatile(), LN->isNonTemporal(), false, 16); + + // When the size is not greater than alignment we get all data with just + // one load + if (alignment >= InVT.getSizeInBits()/8) { + // Update the chain + the_chain = low.getValue(1); + + // Rotate into the preferred slot: + result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128, + low.getValue(0), rotate); + + // Convert the loaded v16i8 vector to the appropriate vector type + // specified by the operand: + EVT vecVT = EVT::getVectorVT(*DAG.getContext(), + InVT, (128 / InVT.getSizeInBits())); + result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, + DAG.getNode(ISD::BITCAST, dl, vecVT, result)); + } + // When alignment is less than the size, we might need (known only at + // run-time) two loads + // TODO: if the memory address is composed only from constants, we have + // extra kowledge, and might avoid the second load + else { + // storage position offset from lower 16 byte aligned memory chunk + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + basePtr, DAG.getConstant( 0xf, MVT::i32 ) ); + // get a registerfull of ones. (this implementation is a workaround: LLVM + // cannot handle 128 bit signed int constants) + SDValue ones = DAG.getConstant(-1, MVT::v4i32 ); + ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones); + + SDValue high = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(16, PtrVT)), + highMemPtr, + LN->isVolatile(), LN->isNonTemporal(), false, + 16); + + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + high.getValue(1)); + + // Shift the (possible) high part right to compensate the misalignemnt. + // if there is no highpart (i.e. value is i64 and offset is 4), this + // will zero out the high value. + high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high, + DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset + )); + + // Shift the low similarly + // TODO: add SPUISD::SHL_BYTES + low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset ); + + // Merge the two parts + result = DAG.getNode(ISD::BITCAST, dl, vecVT, + DAG.getNode(ISD::OR, dl, MVT::i128, low, high)); + + if (!InVT.isVector()) { + result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result ); + } + + } + // Handle extending loads by extending the scalar result: + if (ExtType == ISD::SEXTLOAD) { + result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result); + } else if (ExtType == ISD::ZEXTLOAD) { + result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result); + } else if (ExtType == ISD::EXTLOAD) { + unsigned NewOpc = ISD::ANY_EXTEND; + + if (OutVT.isFloatingPoint()) + NewOpc = ISD::FP_EXTEND; + + result = DAG.getNode(NewOpc, dl, OutVT, result); + } + + SDVTList retvts = DAG.getVTList(OutVT, MVT::Other); + SDValue retops[2] = { + result, + the_chain + }; + + result = DAG.getNode(SPUISD::LDRESULT, dl, retvts, + retops, sizeof(retops) / sizeof(retops[0])); + return result; +} + +/// Custom lower stores for CellSPU +/*! 
+ All CellSPU stores are aligned to 16-byte boundaries, so for elements + within a 16-byte block, we have to generate a shuffle to insert the + requested element into its place, then store the resulting block. + */ +static SDValue +LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { + StoreSDNode *SN = cast<StoreSDNode>(Op); + SDValue Value = SN->getValue(); + EVT VT = Value.getValueType(); + EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT()); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + unsigned alignment = SN->getAlignment(); + SDValue result; + EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT, + (128 / StVT.getSizeInBits())); + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = SN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16); + + + // two sanity checks + assert( SN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (StVT.getSizeInBits() == 128 && (alignment%16) == 0) + return SDValue(); + + SDValue alignLoadVec; + SDValue basePtr = SN->getBasePtr(); + SDValue the_chain = SN->getChain(); + SDValue insertEltOffs; + + if ((alignment%16) == 0) { + ConstantSDNode *CN; + // Special cases for a known aligned load to simplify the base pointer + // and insertion byte: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & 0xf), PtrVT)); + + if ((offset & ~0xf) > 0) { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & ~0xf), PtrVT)); + } + } else { + // Otherwise, assume it's at byte 0 of basePtr + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa<ConstantSDNode>(Op1)) { + // Convert the (add <ptr>, <const>) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add <arg1>, <arg2>) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. 
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Insertion point is solely determined by basePtr's contents + insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Load the lower part of the memory to which to store. + SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr, + lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), + false, 16); + + // if we don't need to store over the 16 byte boundary, one store suffices + if (alignment >= StVT.getSizeInBits()/8) { + // Update the chain + the_chain = low.getValue(1); + + LoadSDNode *LN = cast<LoadSDNode>(low); + SDValue theValue = SN->getValue(); + + if (StVT != VT + && (theValue.getOpcode() == ISD::AssertZext + || theValue.getOpcode() == ISD::AssertSext)) { + // Drill down and get the value for zero- and sign-extended + // quantities + theValue = theValue.getOperand(0); + } + + // If the base pointer is already a D-form address, then just create + // a new D-form address with a slot offset and the orignal base pointer. + // Otherwise generate a D-form address with the slot offset relative + // to the stack pointer, which is always aligned. +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + errs() << "CellSPU LowerSTORE: basePtr = "; + basePtr.getNode()->dump(&DAG); + errs() << "\n"; + } +#endif + + SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, + insertEltOffs); + SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, + theValue); + + result = DAG.getNode(SPUISD::SHUFB, dl, vecVT, + vectorizeOp, low, + DAG.getNode(ISD::BITCAST, dl, + MVT::v4i32, insertEltOp)); + + result = DAG.getStore(the_chain, dl, result, basePtr, + lowMemPtr, + LN->isVolatile(), LN->isNonTemporal(), + 16); + + } + // do the store when it might cross the 16 byte memory access boundary. + else { + // TODO issue a warning if SN->isVolatile()== true? This is likely not + // what the user wanted. + + // address offset from nearest lower 16byte alinged address + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + SN->getBasePtr(), + DAG.getConstant(0xf, MVT::i32)); + // 16 - offset + SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset); + // 16 - sizeof(Value) + SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + DAG.getConstant( VT.getSizeInBits()/8, + MVT::i32)); + // get a registerfull of ones + SDValue ones = DAG.getConstant(-1, MVT::v4i32); + ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones); + + // Create the 128 bit masks that have ones where the data to store is + // located. + SDValue lowmask, himask; + // if the value to store don't fill up the an entire 128 bits, zero + // out the last bits of the mask so that only the value we want to store + // is masked. + // this is e.g. 
in the case of store i32, align 2 + if (!VT.isVector()){ + Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus); + lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + surplus); + Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value); + Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask); + + } + else { + lowmask = ones; + Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value); + } + // this will zero, if there are no data that goes to the high quad + himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + offset_compl); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask, + offset); + + // Load in the old data and zero out the parts that will be overwritten with + // the new data to store. + SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), + false, 16); + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + hi.getValue(1)); + + low = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BITCAST, dl, MVT::i128, low), + DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones)); + hi = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi), + DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones)); + + // Shift the Value to store into place. rlow contains the parts that go to + // the lower memory chunk, rhi has the parts that go to the upper one. + SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset); + rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask); + SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value, + offset_compl); + + // Merge the old data and the new data and store the results + // Need to convert vectors here to integer as 'OR'ing floats assert + rlow = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BITCAST, dl, MVT::i128, low), + DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow)); + rhi = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi), + DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi)); + + low = DAG.getStore(the_chain, dl, rlow, basePtr, + lowMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + hi = DAG.getStore(the_chain, dl, rhi, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0), + hi.getValue(0)); + } + + return result; +} + +//! Generate the address of a constant pool entry. +static SDValue +LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { + EVT PtrVT = Op.getValueType(); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + const Constant *C = CP->getConstVal(); + SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment()); + SDValue Zero = DAG.getConstant(0, PtrVT); + const TargetMachine &TM = DAG.getTarget(); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); + + if (TM.getRelocationModel() == Reloc::Static) { + if (!ST->usingLargeMem()) { + // Just return the SDValue with the constant pool address in it. 
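+      // (Small-memory model: the constant pool entry is reachable through an
+      // absolute 18-bit a-form address, so no hi/lo address pair is needed.)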
+ return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero); + } else { + SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero); + SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero); + return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo); + } + } + + llvm_unreachable("LowerConstantPool: Relocation model other than static" + " not supported."); +} + +//! Alternate entry point for generating the address of a constant pool entry +SDValue +SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) { + return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl()); +} + +static SDValue +LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { + EVT PtrVT = Op.getValueType(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); + SDValue Zero = DAG.getConstant(0, PtrVT); + const TargetMachine &TM = DAG.getTarget(); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); + + if (TM.getRelocationModel() == Reloc::Static) { + if (!ST->usingLargeMem()) { + return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero); + } else { + SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero); + SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero); + return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo); + } + } + + llvm_unreachable("LowerJumpTable: Relocation model other than static" + " not supported."); +} + +static SDValue +LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { + EVT PtrVT = Op.getValueType(); + GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); + const GlobalValue *GV = GSDN->getGlobal(); + SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), + PtrVT, GSDN->getOffset()); + const TargetMachine &TM = DAG.getTarget(); + SDValue Zero = DAG.getConstant(0, PtrVT); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); + + if (TM.getRelocationModel() == Reloc::Static) { + if (!ST->usingLargeMem()) { + return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero); + } else { + SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero); + SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero); + return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo); + } + } else { + report_fatal_error("LowerGlobalAddress: Relocation model other than static" + "not supported."); + /*NOTREACHED*/ + } +} + +//! 
Custom lower double precision floating point constants +static SDValue +LowerConstantFP(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); + + if (VT == MVT::f64) { + ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode()); + + assert((FP != 0) && + "LowerConstantFP: Node is not ConstantFPSDNode"); + + uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble()); + SDValue T = DAG.getConstant(dbits, MVT::i64); + SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T); + return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec)); + } + + return SDValue(); +} + +SDValue +SPUTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) + const { + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>(); + + unsigned ArgOffset = SPUFrameLowering::minStackSize(); + unsigned ArgRegIdx = 0; + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + // FIXME: allow for other calling conventions + CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU); + + // Add DAG nodes to load the arguments or copy them out of registers. + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { + EVT ObjectVT = Ins[ArgNo].VT; + unsigned ObjSize = ObjectVT.getSizeInBits()/8; + SDValue ArgVal; + CCValAssign &VA = ArgLocs[ArgNo]; + + if (VA.isRegLoc()) { + const TargetRegisterClass *ArgRegClass; + + switch (ObjectVT.getSimpleVT().SimpleTy) { + default: + report_fatal_error("LowerFormalArguments Unhandled argument type: " + + Twine(ObjectVT.getEVTString())); + case MVT::i8: + ArgRegClass = &SPU::R8CRegClass; + break; + case MVT::i16: + ArgRegClass = &SPU::R16CRegClass; + break; + case MVT::i32: + ArgRegClass = &SPU::R32CRegClass; + break; + case MVT::i64: + ArgRegClass = &SPU::R64CRegClass; + break; + case MVT::i128: + ArgRegClass = &SPU::GPRCRegClass; + break; + case MVT::f32: + ArgRegClass = &SPU::R32FPRegClass; + break; + case MVT::f64: + ArgRegClass = &SPU::R64FPRegClass; + break; + case MVT::v2f64: + case MVT::v4f32: + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + ArgRegClass = &SPU::VECREGRegClass; + break; + } + + unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + ++ArgRegIdx; + } else { + // We need to load the argument to a virtual register if we determined + // above that we ran out of physical registers of the appropriate type + // or we're forced to do vararg + int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), + false, false, false, 0); + ArgOffset += StackSlotSize; + } + + InVals.push_back(ArgVal); + // Update the chain + Chain = ArgVal.getOperand(0); + } + + // vararg handling: + if (isVarArg) { + // FIXME: we should be able to query the argument registers from + // tablegen 
generated code. + static const uint16_t ArgRegs[] = { + SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9, + SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16, + SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23, + SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30, + SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37, + SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44, + SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51, + SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58, + SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65, + SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72, + SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79 + }; + // size of ArgRegs array + const unsigned NumArgRegs = 77; + + // We will spill (79-3)+1 registers to the stack + SmallVector<SDValue, 79-3+1> MemOps; + + // Create the frame slot + for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) { + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(StackSlotSize, ArgOffset, true)); + SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass); + SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); + SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(), + false, false, 0); + Chain = Store.getOperand(0); + MemOps.push_back(Store); + + // Increment address by stack slot size for the next stored argument + ArgOffset += StackSlotSize; + } + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); + } + + return Chain; +} + +/// isLSAAddress - Return the immediate to use if the specified +/// value is representable as a LSA address. +static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); + if (!C) return 0; + + int Addr = C->getZExtValue(); + if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. + (Addr << 14 >> 14) != Addr) + return 0; // Top 14 bits have to be sext of immediate. + + return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode(); +} + +SDValue +SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool doesNotRet, bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + // CellSPU target does not yet support tail call optimization. + isTailCall = false; + + const SPUSubtarget *ST = SPUTM.getSubtargetImpl(); + unsigned NumOps = Outs.size(); + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + // FIXME: allow for other calling conventions + CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); + + const unsigned NumArgRegs = ArgLocs.size(); + + + // Handy pointer type + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + // Set up a copy of the stack pointer for use loading and storing any + // arguments that may not fit in the registers available for argument + // passing. 
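+  // (R1 is the SPU stack pointer, the same register registered earlier via
+  // setStackPointerRegisterToSaveRestore.)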
+ SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32); + + // Figure out which arguments are going to go in registers, and which in + // memory. + unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR] + unsigned ArgRegIdx = 0; + + // Keep track of registers passing arguments + std::vector<std::pair<unsigned, SDValue> > RegsToPass; + // And the arguments passed on the stack + SmallVector<SDValue, 8> MemOpChains; + + for (; ArgRegIdx != NumOps; ++ArgRegIdx) { + SDValue Arg = OutVals[ArgRegIdx]; + CCValAssign &VA = ArgLocs[ArgRegIdx]; + + // PtrOff will be used to store the current argument to the stack if a + // register cannot be found for it. + SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType()); + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); + + switch (Arg.getValueType().getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected ValueType for argument!"); + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::i128: + case MVT::f32: + case MVT::f64: + case MVT::v2i64: + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + if (ArgRegIdx != NumArgRegs) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), + false, false, 0)); + ArgOffset += StackSlotSize; + } + break; + } + } + + // Accumulate how many bytes are to be pushed on the stack, including the + // linkage area, and parameter passing area. According to the SPU ABI, + // we minimally need space for [LR] and [SP]. + unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize(); + + // Insert a call sequence start + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes, + true)); + + if (!MemOpChains.empty()) { + // Adjust the stack pointer for the stack arguments. + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + } + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + SmallVector<SDValue, 8> Ops; + unsigned CallOpc = SPUISD::CALL; + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + EVT CalleeVT = Callee.getValueType(); + SDValue Zero = DAG.getConstant(0, PtrVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT); + + if (!ST->usingLargeMem()) { + // Turn calls to targets that are defined (i.e., have bodies) into BRSL + // style calls, otherwise, external symbols are BRASL calls. This assumes + // that declared/defined symbols are in the same compilation unit and can + // be reached through PC-relative jumps. + // + // NOTE: + // This may be an unsafe assumption for JIT and really large compilation + // units. 
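+      // (Declarations therefore get an absolute a-form callee address, i.e. a
+      // BRASL-style call, while locally defined functions get a PC-relative
+      // address, i.e. a BRSL-style call.)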
+ if (GV->isDeclaration()) { + Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero); + } else { + Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero); + } + } else { + // "Large memory" mode: Turn all calls into indirect calls with a X-form + // address pairs: + Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero); + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + EVT CalleeVT = Callee.getValueType(); + SDValue Zero = DAG.getConstant(0, PtrVT); + SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(), + Callee.getValueType()); + + if (!ST->usingLargeMem()) { + Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero); + } else { + Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero); + } + } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) { + // If this is an absolute destination address that appears to be a legal + // local store address, use the munged value. + Callee = SDValue(Dest, 0); + } + + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + // Returns a chain and a flag for retval copy to use. + Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue), + &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // If the function returns void, just return the chain. + if (Ins.empty()) + return Chain; + + // Now handle the return value(s) + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU); + + + // If the call has results, copy the values out of the ret val registers. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), + InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + InVals.push_back(Val); + } + + return Chain; +} + +SDValue +SPUTargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { + + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_SPU); + + // If this is the first return lowered for this function, add the regs to the + // liveout set for the function. + if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { + for (unsigned i = 0; i != RVLocs.size(); ++i) + DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); + } + + SDValue Flag; + + // Copy the result values into the output registers. 
+ for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + OutVals[i], Flag); + Flag = Chain.getValue(1); + } + + if (Flag.getNode()) + return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag); + else + return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain); +} + + +//===----------------------------------------------------------------------===// +// Vector related lowering: +//===----------------------------------------------------------------------===// + +static ConstantSDNode * +getVecImm(SDNode *N) { + SDValue OpVal(0, 0); + + // Check to see if this buildvec has a single non-undef value in its elements. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (OpVal.getNode() == 0) + OpVal = N->getOperand(i); + else if (OpVal != N->getOperand(i)) + return 0; + } + + if (OpVal.getNode() != 0) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) { + return CN; + } + } + + return 0; +} + +/// get_vec_i18imm - Test if this vector is a vector filled with the same value +/// and the value fits into an unsigned 18-bit constant, and if so, return the +/// constant +SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType) { + if (ConstantSDNode *CN = getVecImm(N)) { + uint64_t Value = CN->getZExtValue(); + if (ValueType == MVT::i64) { + uint64_t UValue = CN->getZExtValue(); + uint32_t upper = uint32_t(UValue >> 32); + uint32_t lower = uint32_t(UValue); + if (upper != lower) + return SDValue(); + Value = Value >> 32; + } + if (Value <= 0x3ffff) + return DAG.getTargetConstant(Value, ValueType); + } + + return SDValue(); +} + +/// get_vec_i16imm - Test if this vector is a vector filled with the same value +/// and the value fits into a signed 16-bit constant, and if so, return the +/// constant +SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType) { + if (ConstantSDNode *CN = getVecImm(N)) { + int64_t Value = CN->getSExtValue(); + if (ValueType == MVT::i64) { + uint64_t UValue = CN->getZExtValue(); + uint32_t upper = uint32_t(UValue >> 32); + uint32_t lower = uint32_t(UValue); + if (upper != lower) + return SDValue(); + Value = Value >> 32; + } + if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) { + return DAG.getTargetConstant(Value, ValueType); + } + } + + return SDValue(); +} + +/// get_vec_i10imm - Test if this vector is a vector filled with the same value +/// and the value fits into a signed 10-bit constant, and if so, return the +/// constant +SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType) { + if (ConstantSDNode *CN = getVecImm(N)) { + int64_t Value = CN->getSExtValue(); + if (ValueType == MVT::i64) { + uint64_t UValue = CN->getZExtValue(); + uint32_t upper = uint32_t(UValue >> 32); + uint32_t lower = uint32_t(UValue); + if (upper != lower) + return SDValue(); + Value = Value >> 32; + } + if (isInt<10>(Value)) + return DAG.getTargetConstant(Value, ValueType); + } + + return SDValue(); +} + +/// get_vec_i8imm - Test if this vector is a vector filled with the same value +/// and the value fits into a signed 8-bit constant, and if so, return the +/// constant. +/// +/// @note: The incoming vector is v16i8 because that's the only way we can load +/// constant vectors. Thus, we test to see if the upper and lower bytes are the +/// same value. 
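+///
+/// (Illustrative example, not from the original comment: a halfword splat of
+/// 0x5a5a has equal upper and lower bytes, so it is accepted and returned as
+/// the i8 immediate 0x5a.)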
+SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType) { + if (ConstantSDNode *CN = getVecImm(N)) { + int Value = (int) CN->getZExtValue(); + if (ValueType == MVT::i16 + && Value <= 0xffff /* truncated from uint64_t */ + && ((short) Value >> 8) == ((short) Value & 0xff)) + return DAG.getTargetConstant(Value & 0xff, ValueType); + else if (ValueType == MVT::i8 + && (Value & 0xff) == Value) + return DAG.getTargetConstant(Value, ValueType); + } + + return SDValue(); +} + +/// get_ILHUvec_imm - Test if this vector is a vector filled with the same value +/// and the value fits into a signed 16-bit constant, and if so, return the +/// constant +SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType) { + if (ConstantSDNode *CN = getVecImm(N)) { + uint64_t Value = CN->getZExtValue(); + if ((ValueType == MVT::i32 + && ((unsigned) Value & 0xffff0000) == (unsigned) Value) + || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value)) + return DAG.getTargetConstant(Value >> 16, ValueType); + } + + return SDValue(); +} + +/// get_v4i32_imm - Catch-all for general 32-bit constant vectors +SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) { + if (ConstantSDNode *CN = getVecImm(N)) { + return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32); + } + + return SDValue(); +} + +/// get_v4i32_imm - Catch-all for general 64-bit constant vectors +SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) { + if (ConstantSDNode *CN = getVecImm(N)) { + return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64); + } + + return SDValue(); +} + +//! Lower a BUILD_VECTOR instruction creatively: +static SDValue +LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + EVT EltVT = VT.getVectorElementType(); + DebugLoc dl = Op.getDebugLoc(); + BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR"); + unsigned minSplatBits = EltVT.getSizeInBits(); + + if (minSplatBits < 16) + minSplatBits = 16; + + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + + if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, minSplatBits) + || minSplatBits < SplatBitSize) + return SDValue(); // Wasn't a constant vector or splat exceeded min + + uint64_t SplatBits = APSplatBits.getZExtValue(); + + switch (VT.getSimpleVT().SimpleTy) { + default: + report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " + + Twine(VT.getEVTString())); + /*NOTREACHED*/ + case MVT::v4f32: { + uint32_t Value32 = uint32_t(SplatBits); + assert(SplatBitSize == 32 + && "LowerBUILD_VECTOR: Unexpected floating point vector element."); + // NOTE: pretend the constant is an integer. LLVM won't load FP constants + SDValue T = DAG.getConstant(Value32, MVT::i32); + return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T)); + } + case MVT::v2f64: { + uint64_t f64val = uint64_t(SplatBits); + assert(SplatBitSize == 64 + && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes."); + // NOTE: pretend the constant is an integer. 
LLVM won't load FP constants + SDValue T = DAG.getConstant(f64val, MVT::i64); + return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T)); + } + case MVT::v16i8: { + // 8-bit constants have to be expanded to 16-bits + unsigned short Value16 = SplatBits /* | (SplatBits << 8) */; + SmallVector<SDValue, 8> Ops; + + Ops.assign(8, DAG.getConstant(Value16, MVT::i16)); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size())); + } + case MVT::v8i16: { + unsigned short Value16 = SplatBits; + SDValue T = DAG.getConstant(Value16, EltVT); + SmallVector<SDValue, 8> Ops; + + Ops.assign(8, T); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size()); + } + case MVT::v4i32: { + SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T); + } + case MVT::v2i64: { + return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl); + } + } +} + +/*! + */ +SDValue +SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, + DebugLoc dl) { + uint32_t upper = uint32_t(SplatVal >> 32); + uint32_t lower = uint32_t(SplatVal); + + if (upper == lower) { + // Magic constant that can be matched by IL, ILA, et. al. + SDValue Val = DAG.getTargetConstant(upper, MVT::i32); + return DAG.getNode(ISD::BITCAST, dl, OpVT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Val, Val, Val, Val)); + } else { + bool upper_special, lower_special; + + // NOTE: This code creates common-case shuffle masks that can be easily + // detected as common expressions. It is not attempting to create highly + // specialized masks to replace any and all 0's, 0xff's and 0x80's. + + // Detect if the upper or lower half is a special shuffle mask pattern: + upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000); + lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000); + + // Both upper and lower are special, lower to a constant pool load: + if (lower_special && upper_special) { + SDValue UpperVal = DAG.getConstant(upper, MVT::i32); + SDValue LowerVal = DAG.getConstant(lower, MVT::i32); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + UpperVal, LowerVal, UpperVal, LowerVal); + return DAG.getNode(ISD::BITCAST, dl, OpVT, BV); + } + + SDValue LO32; + SDValue HI32; + SmallVector<SDValue, 16> ShufBytes; + SDValue Result; + + // Create lower vector if not a special pattern + if (!lower_special) { + SDValue LO32C = DAG.getConstant(lower, MVT::i32); + LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + LO32C, LO32C, LO32C, LO32C)); + } + + // Create upper vector if not a special pattern + if (!upper_special) { + SDValue HI32C = DAG.getConstant(upper, MVT::i32); + HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + HI32C, HI32C, HI32C, HI32C)); + } + + // If either upper or lower are special, then the two input operands are + // the same (basically, one of them is a "don't care") + if (lower_special) + LO32 = HI32; + if (upper_special) + HI32 = LO32; + + for (int i = 0; i < 4; ++i) { + uint64_t val = 0; + for (int j = 0; j < 4; ++j) { + SDValue V; + bool process_upper, process_lower; + val <<= 8; + process_upper = (upper_special && (i & 1) == 0); + process_lower = (lower_special && (i & 1) == 1); + + if (process_upper || process_lower) { + if ((process_upper && upper == 0) + || (process_lower && lower == 0)) + val |= 0x80; + else if 
((process_upper && upper == 0xffffffff) + || (process_lower && lower == 0xffffffff)) + val |= 0xc0; + else if ((process_upper && upper == 0x80000000) + || (process_lower && lower == 0x80000000)) + val |= (j == 0 ? 0xe0 : 0x80); + } else + val |= i * 4 + j + ((i & 1) * 16); + } + + ShufBytes.push_back(DAG.getConstant(val, MVT::i32)); + } + + return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + &ShufBytes[0], ShufBytes.size())); + } +} + +/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on +/// which the Cell can operate. The code inspects V3 to ascertain whether the +/// permutation vector, V3, is monotonically increasing with one "exception" +/// element, e.g., (0, 1, _, 3). If this is the case, then generate a +/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool. +/// In either case, the net result is going to eventually invoke SHUFB to +/// permute/shuffle the bytes from V1 and V2. +/// \note +/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate +/// control word for byte/halfword/word insertion. This takes care of a single +/// element move from V2 into V1. +/// \note +/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions. +static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { + const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + + if (V2.getOpcode() == ISD::UNDEF) V2 = V1; + + // If we have a single element being moved from V1 to V2, this can be handled + // using the C*[DX] compute mask instructions, but the vector elements have + // to be monotonically increasing with one exception element, and the source + // slot of the element to move must be the same as the destination. + EVT VecVT = V1.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned EltsFromV2 = 0; + unsigned V2EltOffset = 0; + unsigned V2EltIdx0 = 0; + unsigned CurrElt = 0; + unsigned MaxElts = VecVT.getVectorNumElements(); + unsigned PrevElt = 0; + bool monotonic = true; + bool rotate = true; + int rotamt=0; + EVT maskVT; // which of the c?d instructions to use + + if (EltVT == MVT::i8) { + V2EltIdx0 = 16; + maskVT = MVT::v16i8; + } else if (EltVT == MVT::i16) { + V2EltIdx0 = 8; + maskVT = MVT::v8i16; + } else if (EltVT == MVT::i32 || EltVT == MVT::f32) { + V2EltIdx0 = 4; + maskVT = MVT::v4i32; + } else if (EltVT == MVT::i64 || EltVT == MVT::f64) { + V2EltIdx0 = 2; + maskVT = MVT::v2i64; + } else + llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE"); + + for (unsigned i = 0; i != MaxElts; ++i) { + if (SVN->getMaskElt(i) < 0) + continue; + + unsigned SrcElt = SVN->getMaskElt(i); + + if (monotonic) { + if (SrcElt >= V2EltIdx0) { + // TODO: optimize for the monotonic case when several consecutive + // elements are taken form V2. Do we ever get such a case? 
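        // Illustrative examples (assumptions, not from the original comments):
        // for v4i32, the mask <0, 1, 6, 3> is monotonic with a single element
        // taken from V2 (mask value 6 is V2's element 2, landing in slot 2),
        // so one C*D insertion mask suffices; the mask <1, 2, 3, 0> is a pure
        // rotate by one element and is handled by the ROTBYTES_LEFT path below.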
+ if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0)) + V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8); + else + monotonic = false; + ++EltsFromV2; + } else if (CurrElt != SrcElt) { + monotonic = false; + } + + ++CurrElt; + } + + if (rotate) { + if (PrevElt > 0 && SrcElt < MaxElts) { + if ((PrevElt == SrcElt - 1) + || (PrevElt == MaxElts - 1 && SrcElt == 0)) { + PrevElt = SrcElt; + } else { + rotate = false; + } + } else if (i == 0 || (PrevElt==0 && SrcElt==1)) { + // First time or after a "wrap around" + rotamt = SrcElt-i; + PrevElt = SrcElt; + } else { + // This isn't a rotation, takes elements from vector 2 + rotate = false; + } + } + } + + if (EltsFromV2 == 1 && monotonic) { + // Compute mask and shuffle + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + // As SHUFFLE_MASK becomes a c?d instruction, feed it an address + // R1 ($sp) is used here only as it is guaranteed to have last bits zero + SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + DAG.getRegister(SPU::R1, PtrVT), + DAG.getConstant(V2EltOffset, MVT::i32)); + SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, + maskVT, Pointer); + + // Use shuffle mask in SHUFB synthetic instruction: + return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1, + ShufMaskOp); + } else if (rotate) { + if (rotamt < 0) + rotamt +=MaxElts; + rotamt *= EltVT.getSizeInBits()/8; + return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(), + V1, DAG.getConstant(rotamt, MVT::i16)); + } else { + // Convert the SHUFFLE_VECTOR mask's input element units to the + // actual bytes. + unsigned BytesPerElement = EltVT.getSizeInBits()/8; + + SmallVector<SDValue, 16> ResultMask; + for (unsigned i = 0, e = MaxElts; i != e; ++i) { + unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i); + + for (unsigned j = 0; j < BytesPerElement; ++j) + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8)); + } + SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, + &ResultMask[0], ResultMask.size()); + return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask); + } +} + +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); // Op0 = the scalar + DebugLoc dl = Op.getDebugLoc(); + + if (Op0.getNode()->getOpcode() == ISD::Constant) { + // For a constant, build the appropriate constant vector, which will + // eventually simplify to a vector register load. 
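    // For example (illustrative, not from the original comment): lowering
    // (scalar_to_vector (i16 7)) produces the v8i16 BUILD_VECTOR
    // <7,7,7,7,7,7,7,7>, which eventually becomes a single splatted constant
    // load (for instance ILH) instead of an element insertion.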
+ + ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode()); + SmallVector<SDValue, 16> ConstVecValues; + EVT VT; + size_t n_copies; + + // Create a constant vector: + switch (Op.getValueType().getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected constant value type in " + "LowerSCALAR_TO_VECTOR"); + case MVT::v16i8: n_copies = 16; VT = MVT::i8; break; + case MVT::v8i16: n_copies = 8; VT = MVT::i16; break; + case MVT::v4i32: n_copies = 4; VT = MVT::i32; break; + case MVT::v4f32: n_copies = 4; VT = MVT::f32; break; + case MVT::v2i64: n_copies = 2; VT = MVT::i64; break; + case MVT::v2f64: n_copies = 2; VT = MVT::f64; break; + } + + SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT); + for (size_t j = 0; j < n_copies; ++j) + ConstVecValues.push_back(CValue); + + return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(), + &ConstVecValues[0], ConstVecValues.size()); + } else { + // Otherwise, copy the value from one register to another: + switch (Op0.getValueType().getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR"); + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f32: + case MVT::f64: + return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0); + } + } +} + +static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + SDValue N = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + SDValue retval; + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { + // Constant argument: + int EltNo = (int) C->getZExtValue(); + + // sanity checks: + if (VT == MVT::i8 && EltNo >= 16) + llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15"); + else if (VT == MVT::i16 && EltNo >= 8) + llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7"); + else if (VT == MVT::i32 && EltNo >= 4) + llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4"); + else if (VT == MVT::i64 && EltNo >= 2) + llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2"); + + if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) { + // i32 and i64: Element 0 is the preferred slot + return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N); + } + + // Need to generate shuffle mask and extract: + int prefslot_begin = -1, prefslot_end = -1; + int elt_byte = EltNo * VT.getSizeInBits() / 8; + + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid value type!"); + case MVT::i8: { + prefslot_begin = prefslot_end = 3; + break; + } + case MVT::i16: { + prefslot_begin = 2; prefslot_end = 3; + break; + } + case MVT::i32: + case MVT::f32: { + prefslot_begin = 0; prefslot_end = 3; + break; + } + case MVT::i64: + case MVT::f64: { + prefslot_begin = 0; prefslot_end = 7; + break; + } + } + + assert(prefslot_begin != -1 && prefslot_end != -1 && + "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized"); + + unsigned int ShufBytes[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + for (int i = 0; i < 16; ++i) { + // zero fill uppper part of preferred slot, don't care about the + // other slots: + unsigned int mask_val; + if (i <= prefslot_end) { + mask_val = + ((i < prefslot_begin) + ? 
0x80 + : elt_byte + (i - prefslot_begin)); + + ShufBytes[i] = mask_val; + } else + ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)]; + } + + SDValue ShufMask[4]; + for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) { + unsigned bidx = i * 4; + unsigned int bits = ((ShufBytes[bidx] << 24) | + (ShufBytes[bidx+1] << 16) | + (ShufBytes[bidx+2] << 8) | + ShufBytes[bidx+3]); + ShufMask[i] = DAG.getConstant(bits, MVT::i32); + } + + SDValue ShufMaskVec = + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0])); + + retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, + DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(), + N, N, ShufMaskVec)); + } else { + // Variable index: Rotate the requested element into slot 0, then replicate + // slot 0 across the vector + EVT VecVT = N.getValueType(); + if (!VecVT.isSimple() || !VecVT.isVector()) { + report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit" + "vector type!"); + } + + // Make life easier by making sure the index is zero-extended to i32 + if (Elt.getValueType() != MVT::i32) + Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt); + + // Scale the index to a bit/byte shift quantity + APInt scaleFactor = + APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false); + unsigned scaleShift = scaleFactor.logBase2(); + SDValue vecShift; + + if (scaleShift > 0) { + // Scale the shift factor: + Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt, + DAG.getConstant(scaleShift, MVT::i32)); + } + + vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt); + + // Replicate the bytes starting at byte 0 across the entire vector (for + // consistency with the notion of a unified register set) + SDValue replicate; + + switch (VT.getSimpleVT().SimpleTy) { + default: + report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector" + "type"); + /*NOTREACHED*/ + case MVT::i8: { + SDValue factor = DAG.getConstant(0x00000000, MVT::i32); + replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + factor, factor, factor, factor); + break; + } + case MVT::i16: { + SDValue factor = DAG.getConstant(0x00010001, MVT::i32); + replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + factor, factor, factor, factor); + break; + } + case MVT::i32: + case MVT::f32: { + SDValue factor = DAG.getConstant(0x00010203, MVT::i32); + replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + factor, factor, factor, factor); + break; + } + case MVT::i64: + case MVT::f64: { + SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32); + SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32); + replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + loFactor, hiFactor, loFactor, hiFactor); + break; + } + } + + retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, + DAG.getNode(SPUISD::SHUFB, dl, VecVT, + vecShift, vecShift, replicate)); + } + + return retval; +} + +static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { + SDValue VecOp = Op.getOperand(0); + SDValue ValOp = Op.getOperand(1); + SDValue IdxOp = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT eltVT = ValOp.getValueType(); + + // use 0 when the lane to insert to is 'undef' + int64_t Offset=0; + if (IdxOp.getOpcode() != ISD::UNDEF) { + ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp); + assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); + Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8; + } + + EVT PtrVT = 
DAG.getTargetLoweringInfo().getPointerTy(); + // Use $sp ($1) because it's always 16-byte aligned and it's available: + SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + DAG.getRegister(SPU::R1, PtrVT), + DAG.getConstant(Offset, PtrVT)); + // widen the mask when dealing with half vectors + EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), + 128/ VT.getVectorElementType().getSizeInBits()); + SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); + + SDValue result = + DAG.getNode(SPUISD::SHUFB, dl, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp), + VecOp, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask)); + + return result; +} + +static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc, + const TargetLowering &TLI) +{ + SDValue N0 = Op.getOperand(0); // Everything has at least one operand + DebugLoc dl = Op.getDebugLoc(); + EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType()); + + assert(Op.getValueType() == MVT::i8); + switch (Opc) { + default: + llvm_unreachable("Unhandled i8 math operator"); + case ISD::ADD: { + // 8-bit addition: Promote the arguments up to 16-bits and truncate + // the result: + SDValue N1 = Op.getOperand(1); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); + + } + + case ISD::SUB: { + // 8-bit subtraction: Promote the arguments up to 16-bits and truncate + // the result: + SDValue N1 = Op.getOperand(1); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); + } + case ISD::ROTR: + case ISD::ROTL: { + SDValue N1 = Op.getOperand(1); + EVT N1VT = N1.getValueType(); + + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0); + if (!N1VT.bitsEq(ShiftVT)) { + unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT) + ? 
ISD::ZERO_EXTEND + : ISD::TRUNCATE; + N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1); + } + + // Replicate lower 8-bits into upper 8: + SDValue ExpandArg = + DAG.getNode(ISD::OR, dl, MVT::i16, N0, + DAG.getNode(ISD::SHL, dl, MVT::i16, + N0, DAG.getConstant(8, MVT::i32))); + + // Truncate back down to i8 + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1)); + } + case ISD::SRL: + case ISD::SHL: { + SDValue N1 = Op.getOperand(1); + EVT N1VT = N1.getValueType(); + + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0); + if (!N1VT.bitsEq(ShiftVT)) { + unsigned N1Opc = ISD::ZERO_EXTEND; + + if (N1.getValueType().bitsGT(ShiftVT)) + N1Opc = ISD::TRUNCATE; + + N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1); + } + + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); + } + case ISD::SRA: { + SDValue N1 = Op.getOperand(1); + EVT N1VT = N1.getValueType(); + + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0); + if (!N1VT.bitsEq(ShiftVT)) { + unsigned N1Opc = ISD::SIGN_EXTEND; + + if (N1VT.bitsGT(ShiftVT)) + N1Opc = ISD::TRUNCATE; + N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1); + } + + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); + } + case ISD::MUL: { + SDValue N1 = Op.getOperand(1); + + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); + } + } +} + +//! Lower byte immediate operations for v16i8 vectors: +static SDValue +LowerByteImmed(SDValue Op, SelectionDAG &DAG) { + SDValue ConstVec; + SDValue Arg; + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + ConstVec = Op.getOperand(0); + Arg = Op.getOperand(1); + if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) { + if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) { + ConstVec = ConstVec.getOperand(0); + } else { + ConstVec = Op.getOperand(1); + Arg = Op.getOperand(0); + if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) { + ConstVec = ConstVec.getOperand(0); + } + } + } + + if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) { + BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode()); + assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed"); + + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + unsigned minSplatBits = VT.getVectorElementType().getSizeInBits(); + + if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, minSplatBits) + && minSplatBits <= SplatBitSize) { + uint64_t SplatBits = APSplatBits.getZExtValue(); + SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8); + + SmallVector<SDValue, 16> tcVec; + tcVec.assign(16, tc); + return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size())); + } + } + + // These operations (AND, OR, XOR) are legal, they just couldn't be custom + // lowered. Return the operation, rather than a null SDValue. + return Op; +} + +//! Custom lowering for CTPOP (count population) +/*! + Custom lowering code that counts the number ones in the input + operand. SPU has such an instruction, but it counts the number of + ones per byte, which then have to be accumulated. 
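  For example (an illustrative walk-through, not part of the original
  comment): for an i32 operand, CNTB leaves one per-byte count in each byte of
  the preferred word, and the MVT::i32 case below folds them with two
  shift-and-add steps,

      sum16 = (cnt >> 16) + cnt;     // pairs up the byte counts
      sum8  = (sum16 >> 8) + sum16;  // low byte now holds all four counts
      pop   = sum8 & 0xff;

  which is the Comp1/Sum1/Comp2/Sum2 sequence expressed on scalars.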
+*/ +static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + EVT vecVT = EVT::getVectorVT(*DAG.getContext(), + VT, (128 / VT.getSizeInBits())); + DebugLoc dl = Op.getDebugLoc(); + + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid value type!"); + case MVT::i8: { + SDValue N = Op.getOperand(0); + SDValue Elt0 = DAG.getConstant(0, MVT::i32); + + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N); + SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0); + } + + case MVT::i16: { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass); + + SDValue N = Op.getOperand(0); + SDValue Elt0 = DAG.getConstant(0, MVT::i16); + SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16); + SDValue Shift1 = DAG.getConstant(8, MVT::i32); + + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N); + SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote); + + // CNTB_result becomes the chain to which all of the virtual registers + // CNTB_reg, SUM1_reg become associated: + SDValue CNTB_result = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0); + + SDValue CNTB_rescopy = + DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result); + + SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16); + + return DAG.getNode(ISD::AND, dl, MVT::i16, + DAG.getNode(ISD::ADD, dl, MVT::i16, + DAG.getNode(ISD::SRL, dl, MVT::i16, + Tmp1, Shift1), + Tmp1), + Mask0); + } + + case MVT::i32: { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + + SDValue N = Op.getOperand(0); + SDValue Elt0 = DAG.getConstant(0, MVT::i32); + SDValue Mask0 = DAG.getConstant(0xff, MVT::i32); + SDValue Shift1 = DAG.getConstant(16, MVT::i32); + SDValue Shift2 = DAG.getConstant(8, MVT::i32); + + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N); + SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote); + + // CNTB_result becomes the chain to which all of the virtual registers + // CNTB_reg, SUM1_reg become associated: + SDValue CNTB_result = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0); + + SDValue CNTB_rescopy = + DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result); + + SDValue Comp1 = + DAG.getNode(ISD::SRL, dl, MVT::i32, + DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32), + Shift1); + + SDValue Sum1 = + DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1, + DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32)); + + SDValue Sum1_rescopy = + DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1); + + SDValue Comp2 = + DAG.getNode(ISD::SRL, dl, MVT::i32, + DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32), + Shift2); + SDValue Sum2 = + DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2, + DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32)); + + return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0); + } + + case MVT::i64: + break; + } + + return SDValue(); +} + +//! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32 +/*! + f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall. + All conversions to i64 are expanded to a libcall. 
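  For example (illustrative): an (fp_to_sint f64 -> i32) node is rewritten
  into a call to the runtime routine obtained from RTLIB::getFPTOSINT
  (__fixdfsi in libgcc/compiler-rt), while f32 -> i32 is returned unchanged
  and selected normally.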
+ */ +static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, + const SPUTargetLowering &TLI) { + EVT OpVT = Op.getValueType(); + SDValue Op0 = Op.getOperand(0); + EVT Op0VT = Op0.getValueType(); + + if ((OpVT == MVT::i32 && Op0VT == MVT::f64) + || OpVT == MVT::i64) { + // Convert f32 / f64 to i32 / i64 via libcall. + RTLIB::Libcall LC = + (Op.getOpcode() == ISD::FP_TO_SINT) + ? RTLIB::getFPTOSINT(Op0VT, OpVT) + : RTLIB::getFPTOUINT(Op0VT, OpVT); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!"); + SDValue Dummy; + return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI); + } + + return Op; +} + +//! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32 +/*! + i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall. + All conversions from i64 are expanded to a libcall. + */ +static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, + const SPUTargetLowering &TLI) { + EVT OpVT = Op.getValueType(); + SDValue Op0 = Op.getOperand(0); + EVT Op0VT = Op0.getValueType(); + + if ((OpVT == MVT::f64 && Op0VT == MVT::i32) + || Op0VT == MVT::i64) { + // Convert i32, i64 to f64 via libcall: + RTLIB::Libcall LC = + (Op.getOpcode() == ISD::SINT_TO_FP) + ? RTLIB::getSINTTOFP(Op0VT, OpVT) + : RTLIB::getUINTTOFP(Op0VT, OpVT); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!"); + SDValue Dummy; + return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI); + } + + return Op; +} + +//! Lower ISD::SETCC +/*! + This handles MVT::f64 (double floating point) condition lowering + */ +static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG, + const TargetLowering &TLI) { + CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2)); + DebugLoc dl = Op.getDebugLoc(); + assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n"); + + SDValue lhs = Op.getOperand(0); + SDValue rhs = Op.getOperand(1); + EVT lhsVT = lhs.getValueType(); + assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n"); + + EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType()); + APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits()); + EVT IntVT(MVT::i64); + + // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently + // selected to a NOP: + SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs); + SDValue lhsHi32 = + DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRL, dl, IntVT, + i64lhs, DAG.getConstant(32, MVT::i32))); + SDValue lhsHi32abs = + DAG.getNode(ISD::AND, dl, MVT::i32, + lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32)); + SDValue lhsLo32 = + DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs); + + // SETO and SETUO only use the lhs operand: + if (CC->get() == ISD::SETO) { + // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of + // SETUO + APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits()); + return DAG.getNode(ISD::XOR, dl, ccResultVT, + DAG.getSetCC(dl, ccResultVT, + lhs, DAG.getConstantFP(0.0, lhsVT), + ISD::SETUO), + DAG.getConstant(ccResultAllOnes, ccResultVT)); + } else if (CC->get() == ISD::SETUO) { + // Evaluates to true if Op0 is [SQ]NaN + return DAG.getNode(ISD::AND, dl, ccResultVT, + DAG.getSetCC(dl, ccResultVT, + lhsHi32abs, + DAG.getConstant(0x7ff00000, MVT::i32), + ISD::SETGE), + DAG.getSetCC(dl, ccResultVT, + lhsLo32, + DAG.getConstant(0, MVT::i32), + ISD::SETGT)); + } + + SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs); + SDValue rhsHi32 = + DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRL, dl, IntVT, + 
i64rhs, DAG.getConstant(32, MVT::i32))); + + // If a value is negative, subtract from the sign magnitude constant: + SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT); + + // Convert the sign-magnitude representation into 2's complement: + SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT, + lhsHi32, DAG.getConstant(31, MVT::i32)); + SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs); + SDValue lhsSelect = + DAG.getNode(ISD::SELECT, dl, IntVT, + lhsSelectMask, lhsSignMag2TC, i64lhs); + + SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT, + rhsHi32, DAG.getConstant(31, MVT::i32)); + SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs); + SDValue rhsSelect = + DAG.getNode(ISD::SELECT, dl, IntVT, + rhsSelectMask, rhsSignMag2TC, i64rhs); + + unsigned compareOp; + + switch (CC->get()) { + case ISD::SETOEQ: + case ISD::SETUEQ: + compareOp = ISD::SETEQ; break; + case ISD::SETOGT: + case ISD::SETUGT: + compareOp = ISD::SETGT; break; + case ISD::SETOGE: + case ISD::SETUGE: + compareOp = ISD::SETGE; break; + case ISD::SETOLT: + case ISD::SETULT: + compareOp = ISD::SETLT; break; + case ISD::SETOLE: + case ISD::SETULE: + compareOp = ISD::SETLE; break; + case ISD::SETUNE: + case ISD::SETONE: + compareOp = ISD::SETNE; break; + default: + report_fatal_error("CellSPU ISel Select: unimplemented f64 condition"); + } + + SDValue result = + DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect, + (ISD::CondCode) compareOp); + + if ((CC->get() & 0x8) == 0) { + // Ordered comparison: + SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT, + lhs, DAG.getConstantFP(0.0, MVT::f64), + ISD::SETO); + SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT, + rhs, DAG.getConstantFP(0.0, MVT::f64), + ISD::SETO); + SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN); + + result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result); + } + + return result; +} + +//! Lower ISD::SELECT_CC +/*! + ISD::SELECT_CC can (generally) be implemented directly on the SPU using the + SELB instruction. + + \note Need to revisit this in the future: if the code path through the true + and false value computations is longer than the latency of a branch (6 + cycles), then it would be more advantageous to branch and insert a new basic + block and branch on the condition. However, this code does not make that + assumption, given the simplisitc uses so far. + */ + +static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG, + const TargetLowering &TLI) { + EVT VT = Op.getValueType(); + SDValue lhs = Op.getOperand(0); + SDValue rhs = Op.getOperand(1); + SDValue trueval = Op.getOperand(2); + SDValue falseval = Op.getOperand(3); + SDValue condition = Op.getOperand(4); + DebugLoc dl = Op.getDebugLoc(); + + // NOTE: SELB's arguments: $rA, $rB, $mask + // + // SELB selects bits from $rA where bits in $mask are 0, bits from $rB + // where bits in $mask are 1. CCond will be inverted, having 1s where the + // condition was true and 0s where the condition was false. Hence, the + // arguments to SELB get reversed. + + // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's + // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up + // with another "cannot select select_cc" assert: + + SDValue compare = DAG.getNode(ISD::SETCC, dl, + TLI.getSetCCResultType(Op.getValueType()), + lhs, rhs, condition); + return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare); +} + +//! 
Custom lower ISD::TRUNCATE +static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) +{ + // Type to truncate to + EVT VT = Op.getValueType(); + MVT simpleVT = VT.getSimpleVT(); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), + VT, (128 / VT.getSizeInBits())); + DebugLoc dl = Op.getDebugLoc(); + + // Type to truncate from + SDValue Op0 = Op.getOperand(0); + EVT Op0VT = Op0.getValueType(); + + if (Op0VT == MVT::i128 && simpleVT == MVT::i64) { + // Create shuffle mask, least significant doubleword of quadword + unsigned maskHigh = 0x08090a0b; + unsigned maskLow = 0x0c0d0e0f; + // Use a shuffle to perform the truncation + SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + DAG.getConstant(maskHigh, MVT::i32), + DAG.getConstant(maskLow, MVT::i32), + DAG.getConstant(maskHigh, MVT::i32), + DAG.getConstant(maskLow, MVT::i32)); + + SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT, + Op0, Op0, shufMask); + + return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle); + } + + return SDValue(); // Leave the truncate unmolested +} + +/*! + * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic + * algorithm is to duplicate the sign bit using rotmai to generate at + * least one byte full of sign bits. Then propagate the "sign-byte" into + * the leftmost words and the i64/i32 into the rightmost words using shufb. + * + * @param Op The sext operand + * @param DAG The current DAG + * @return The SDValue with the entire instruction sequence + */ +static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) +{ + DebugLoc dl = Op.getDebugLoc(); + + // Type to extend to + MVT OpVT = Op.getValueType().getSimpleVT(); + + // Type to extend from + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = Op0.getValueType().getSimpleVT(); + + // extend i8 & i16 via i32 + if (Op0VT == MVT::i8 || Op0VT == MVT::i16) { + Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0); + Op0VT = MVT::i32; + } + + // The type to extend to needs to be a i128 and + // the type to extend from needs to be i64 or i32. + assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) && + "LowerSIGN_EXTEND: input and/or output operand have wrong size"); + (void)OpVT; + + // Create shuffle mask + unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7 + unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte 8 - 11 + unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15 + SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + DAG.getConstant(mask1, MVT::i32), + DAG.getConstant(mask1, MVT::i32), + DAG.getConstant(mask2, MVT::i32), + DAG.getConstant(mask3, MVT::i32)); + + // Word wise arithmetic right shift to generate at least one byte + // that contains sign bits. + MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32; + SDValue sraVal = DAG.getNode(ISD::SRA, + dl, + mvt, + DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0), + DAG.getConstant(31, MVT::i32)); + + // reinterpret as a i128 (SHUFB requires it). This gets lowered away. + SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op0VT, Op0, + DAG.getTargetConstant( + SPU::GPRCRegClass.getID(), + MVT::i32)), 0); + // Shuffle bytes - Copy the sign bits into the upper 64 bits + // and the input value into the lower 64 bits. + SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt, + extended, sraVal, shufMask); + return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle); +} + +//! Custom (target-specific) lowering entry point +/*! 
+ This is where LLVM's DAG selection process calls to do target-specific + lowering of nodes. + */ +SDValue +SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const +{ + unsigned Opc = (unsigned) Op.getOpcode(); + EVT VT = Op.getValueType(); + + switch (Opc) { + default: { +#ifndef NDEBUG + errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n"; + errs() << "Op.getOpcode() = " << Opc << "\n"; + errs() << "*Op.getNode():\n"; + Op.getNode()->dump(); +#endif + llvm_unreachable(0); + } + case ISD::LOAD: + case ISD::EXTLOAD: + case ISD::SEXTLOAD: + case ISD::ZEXTLOAD: + return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::STORE: + return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::ConstantPool: + return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl()); + case ISD::ConstantFP: + return LowerConstantFP(Op, DAG); + + // i8, i64 math ops: + case ISD::ADD: + case ISD::SUB: + case ISD::ROTR: + case ISD::ROTL: + case ISD::SRL: + case ISD::SHL: + case ISD::SRA: { + if (VT == MVT::i8) + return LowerI8Math(Op, DAG, Opc, *this); + break; + } + + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG, *this); + + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return LowerINT_TO_FP(Op, DAG, *this); + + // Vector-related lowering. + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); + case ISD::SCALAR_TO_VECTOR: + return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return LowerINSERT_VECTOR_ELT(Op, DAG); + + // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + return LowerByteImmed(Op, DAG); + + // Vector and i8 multiply: + case ISD::MUL: + if (VT == MVT::i8) + return LowerI8Math(Op, DAG, Opc, *this); + + case ISD::CTPOP: + return LowerCTPOP(Op, DAG); + + case ISD::SELECT_CC: + return LowerSELECT_CC(Op, DAG, *this); + + case ISD::SETCC: + return LowerSETCC(Op, DAG, *this); + + case ISD::TRUNCATE: + return LowerTRUNCATE(Op, DAG); + + case ISD::SIGN_EXTEND: + return LowerSIGN_EXTEND(Op, DAG); + } + + return SDValue(); +} + +void SPUTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const +{ +#if 0 + unsigned Opc = (unsigned) N->getOpcode(); + EVT OpVT = N->getValueType(0); + + switch (Opc) { + default: { + errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n"; + errs() << "Op.getOpcode() = " << Opc << "\n"; + errs() << "*Op.getNode():\n"; + N->dump(); + abort(); + /*NOTREACHED*/ + } + } +#endif + + /* Otherwise, return unchanged */ +} + +//===----------------------------------------------------------------------===// +// Target Optimization Hooks +//===----------------------------------------------------------------------===// + +SDValue +SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const +{ +#if 0 + TargetMachine &TM = getTargetMachine(); +#endif + const SPUSubtarget *ST = SPUTM.getSubtargetImpl(); + SelectionDAG &DAG = DCI.DAG; + SDValue Op0 = N->getOperand(0); // everything has at least one operand + EVT NodeVT = N->getValueType(0); // The node's value type + EVT Op0VT = Op0.getValueType(); // The first operand's 
result + SDValue Result; // Initially, empty result + DebugLoc dl = N->getDebugLoc(); + + switch (N->getOpcode()) { + default: break; + case ISD::ADD: { + SDValue Op1 = N->getOperand(1); + + if (Op0.getOpcode() == SPUISD::IndirectAddr + || Op1.getOpcode() == SPUISD::IndirectAddr) { + // Normalize the operands to reduce repeated code + SDValue IndirectArg = Op0, AddArg = Op1; + + if (Op1.getOpcode() == SPUISD::IndirectAddr) { + IndirectArg = Op1; + AddArg = Op0; + } + + if (isa<ConstantSDNode>(AddArg)) { + ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg); + SDValue IndOp1 = IndirectArg.getOperand(1); + + if (CN0->isNullValue()) { + // (add (SPUindirect <arg>, <arg>), 0) -> + // (SPUindirect <arg>, <arg>) + +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + errs() << "\n" + << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n" + << "With: (SPUindirect <arg>, <arg>)\n"; + } +#endif + + return IndirectArg; + } else if (isa<ConstantSDNode>(IndOp1)) { + // (add (SPUindirect <arg>, <const>), <const>) -> + // (SPUindirect <arg>, <const + const>) + ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1); + int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue(); + SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT); + +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + errs() << "\n" + << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue() + << "), " << CN0->getSExtValue() << ")\n" + << "With: (SPUindirect <arg>, " + << combinedConst << ")\n"; + } +#endif + + return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT, + IndirectArg, combinedValue); + } + } + } + break; + } + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: { + if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) { + // (any_extend (SPUextract_elt0 <arg>)) -> + // (SPUextract_elt0 <arg>) + // Types must match, however... 
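      // e.g. (i32 (any_extend (i32 (SPUextract_elt0 v4i32:$x)))) is a no-op
      // wrapper and folds to the extract itself; an extend to a wider type
      // cannot be folded this way, since it would still have to produce the
      // new high bits. (Illustrative example, not from the original comment.)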
+#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + errs() << "\nReplace: "; + N->dump(&DAG); + errs() << "\nWith: "; + Op0.getNode()->dump(&DAG); + errs() << "\n"; + } +#endif + + return Op0; + } + break; + } + case SPUISD::IndirectAddr: { + if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) { + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (CN != 0 && CN->isNullValue()) { + // (SPUindirect (SPUaform <addr>, 0), 0) -> + // (SPUaform <addr>, 0) + + DEBUG(errs() << "Replace: "); + DEBUG(N->dump(&DAG)); + DEBUG(errs() << "\nWith: "); + DEBUG(Op0.getNode()->dump(&DAG)); + DEBUG(errs() << "\n"); + + return Op0; + } + } else if (Op0.getOpcode() == ISD::ADD) { + SDValue Op1 = N->getOperand(1); + if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) { + // (SPUindirect (add <arg>, <arg>), 0) -> + // (SPUindirect <arg>, <arg>) + if (CN1->isNullValue()) { + +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + errs() << "\n" + << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n" + << "With: (SPUindirect <arg>, <arg>)\n"; + } +#endif + + return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT, + Op0.getOperand(0), Op0.getOperand(1)); + } + } + } + break; + } + case SPUISD::SHL_BITS: + case SPUISD::SHL_BYTES: + case SPUISD::ROTBYTES_LEFT: { + SDValue Op1 = N->getOperand(1); + + // Kill degenerate vector shifts: + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) { + if (CN->isNullValue()) { + Result = Op0; + } + } + break; + } + case SPUISD::PREFSLOT2VEC: { + switch (Op0.getOpcode()) { + default: + break; + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: { + // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) -> + // <arg> + // but only if the SPUprefslot2vec and <arg> types match. + SDValue Op00 = Op0.getOperand(0); + if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) { + SDValue Op000 = Op00.getOperand(0); + if (Op000.getValueType() == NodeVT) { + Result = Op000; + } + } + break; + } + case SPUISD::VEC2PREFSLOT: { + // (SPUprefslot2vec (SPUvec2prefslot <arg>)) -> + // <arg> + Result = Op0.getOperand(0); + break; + } + } + break; + } + } + + // Otherwise, return unchanged. +#ifndef NDEBUG + if (Result.getNode()) { + DEBUG(errs() << "\nReplace.SPU: "); + DEBUG(N->dump(&DAG)); + DEBUG(errs() << "\nWith: "); + DEBUG(Result.getNode()->dump(&DAG)); + DEBUG(errs() << "\n"); + } +#endif + + return Result; +} + +//===----------------------------------------------------------------------===// +// Inline Assembly Support +//===----------------------------------------------------------------------===// + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +SPUTargetLowering::ConstraintType +SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const { + if (ConstraintLetter.size() == 1) { + switch (ConstraintLetter[0]) { + default: break; + case 'b': + case 'r': + case 'f': + case 'v': + case 'y': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(ConstraintLetter); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. 
+TargetLowering::ConstraintWeight +SPUTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + //FIXME: Seems like the supported constraint letters were just copied + // from PPC, as the following doesn't correspond to the GCC docs. + // I'm leaving it so until someone adds the corresponding lowering support. + case 'b': + case 'r': + case 'f': + case 'd': + case 'v': + case 'y': + weight = CW_Register; + break; + } + return weight; +} + +std::pair<unsigned, const TargetRegisterClass*> +SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const +{ + if (Constraint.size() == 1) { + // GCC RS6000 Constraint Letters + switch (Constraint[0]) { + case 'b': // R1-R31 + case 'r': // R0-R31 + if (VT == MVT::i64) + return std::make_pair(0U, SPU::R64CRegisterClass); + return std::make_pair(0U, SPU::R32CRegisterClass); + case 'f': + if (VT == MVT::f32) + return std::make_pair(0U, SPU::R32FPRegisterClass); + else if (VT == MVT::f64) + return std::make_pair(0U, SPU::R64FPRegisterClass); + break; + case 'v': + return std::make_pair(0U, SPU::GPRCRegisterClass); + } + } + + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} + +//! Compute used/known bits for a SPU operand +void +SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth ) const { +#if 0 + const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT; + + switch (Op.getOpcode()) { + default: + // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); + break; + case CALL: + case SHUFB: + case SHUFFLE_MASK: + case CNTB: + case SPUISD::PREFSLOT2VEC: + case SPUISD::LDRESULT: + case SPUISD::VEC2PREFSLOT: + case SPUISD::SHLQUAD_L_BITS: + case SPUISD::SHLQUAD_L_BYTES: + case SPUISD::VEC_ROTL: + case SPUISD::VEC_ROTR: + case SPUISD::ROTBYTES_LEFT: + case SPUISD::SELECT_MASK: + case SPUISD::SELB: + } +#endif +} + +unsigned +SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth) const { + switch (Op.getOpcode()) { + default: + return 1; + + case ISD::SETCC: { + EVT VT = Op.getValueType(); + + if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) { + VT = MVT::i32; + } + return VT.getSizeInBits(); + } + } +} + +// LowerAsmOperandForConstraint +void +SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const { + // Default, for the time being, to the base class handler + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +/// isLegalAddressImmediate - Return true if the integer value can be used +/// as the offset of the target addressing mode. +bool SPUTargetLowering::isLegalAddressImmediate(int64_t V, + Type *Ty) const { + // SPU's addresses are 256K: + return (V > -(1 << 18) && V < (1 << 18) - 1); +} + +bool SPUTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const { + return false; +} + +bool +SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // The SPU target isn't yet aware of offsets. 
+ return false; +} + +// can we compare to Imm without writing it into a register? +bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + //ceqi, cgti, etc. all take s10 operand + return isInt<10>(Imm); +} + +bool +SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type * ) const{ + + // A-form: 18bit absolute address. + if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0) + return true; + + // D-form: reg + 14bit offset + if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs)) + return true; + + // X-form: reg+reg + if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0) + return true; + + return false; +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPUISelLowering.h b/contrib/llvm/lib/Target/CellSPU/SPUISelLowering.h new file mode 100644 index 000000000000..e3db7b2f1fbc --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUISelLowering.h @@ -0,0 +1,185 @@ +//===-- SPUISelLowering.h - Cell SPU DAG Lowering Interface -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that Cell SPU uses to lower LLVM code into +// a selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef SPU_ISELLOWERING_H +#define SPU_ISELLOWERING_H + +#include "SPU.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" + +namespace llvm { + namespace SPUISD { + enum NodeType { + // Start the numbering where the builting ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + // Pseudo instructions: + RET_FLAG, ///< Return with flag, matched by bi instruction + + Hi, ///< High address component (upper 16) + Lo, ///< Low address component (lower 16) + PCRelAddr, ///< Program counter relative address + AFormAddr, ///< A-form address (local store) + IndirectAddr, ///< D-Form "imm($r)" and X-form "$r($r)" + + LDRESULT, ///< Load result (value, chain) + CALL, ///< CALL instruction + SHUFB, ///< Vector shuffle (permute) + SHUFFLE_MASK, ///< Shuffle mask + CNTB, ///< Count leading ones in bytes + PREFSLOT2VEC, ///< Promote scalar->vector + VEC2PREFSLOT, ///< Extract element 0 + SHL_BITS, ///< Shift quad left, by bits + SHL_BYTES, ///< Shift quad left, by bytes + SRL_BYTES, ///< Shift quad right, by bytes. Insert zeros. + VEC_ROTL, ///< Vector rotate left + VEC_ROTR, ///< Vector rotate right + ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI) + ROTBYTES_LEFT_BITS, ///< Rotate bytes left by bit shift count + SELECT_MASK, ///< Select Mask (FSM, FSMB, FSMH, FSMBI) + SELB, ///< Select bits -> (b & mask) | (a & ~mask) + // Markers: These aren't used to generate target-dependent nodes, but + // are used during instruction selection. + ADD64_MARKER, ///< i64 addition marker + SUB64_MARKER, ///< i64 subtraction marker + MUL64_MARKER, ///< i64 multiply marker + LAST_SPUISD ///< Last user-defined instruction + }; + } + + //! 
Utility functions specific to CellSPU: + namespace SPU { + SDValue get_vec_u18imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType); + SDValue get_vec_i16imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType); + SDValue get_vec_i10imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType); + SDValue get_vec_i8imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType); + SDValue get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG, + EVT ValueType); + SDValue get_v4i32_imm(SDNode *N, SelectionDAG &DAG); + SDValue get_v2i64_imm(SDNode *N, SelectionDAG &DAG); + + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG, + const SPUTargetMachine &TM); + //! Simplify a EVT::v2i64 constant splat to CellSPU-ready form + SDValue LowerV2I64Splat(EVT OpVT, SelectionDAG &DAG, uint64_t splat, + DebugLoc dl); + } + + class SPUTargetMachine; // forward dec'l. + + class SPUTargetLowering : + public TargetLowering + { + int VarArgsFrameIndex; // FrameIndex for start of varargs area. + SPUTargetMachine &SPUTM; + + public: + //! The venerable constructor + /*! + This is where the CellSPU backend sets operation handling (i.e., legal, + custom, expand or promote.) + */ + SPUTargetLowering(SPUTargetMachine &TM); + + //! Get the target machine + SPUTargetMachine &getSPUTargetMachine() { + return SPUTM; + } + + /// getTargetNodeName() - This method returns the name of a target specific + /// DAG node. + virtual const char *getTargetNodeName(unsigned Opcode) const; + + /// getSetCCResultType - Return the ValueType for ISD::SETCC + virtual EVT getSetCCResultType(EVT VT) const; + + virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; } + + //! Custom lowering hooks + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + //! Custom lowering hook for nodes with illegal result types. + virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const; + + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + virtual void computeMaskedBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth = 0) const; + + ConstraintType getConstraintType(const std::string &ConstraintLetter) const; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const; + + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + + /// isLegalAddressImmediate - Return true if the integer value can be used + /// as the offset of the target addressing mode. 
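    /// For example (illustrative, based on the 256K local-store range check
    /// in the .cpp file): an offset of 0x1ffff is accepted, 0x40000 is not.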
+ virtual bool isLegalAddressImmediate(int64_t V, Type *Ty) const; + virtual bool isLegalAddressImmediate(GlobalValue *) const; + + virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + + virtual SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool doesNotRet, bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const; + + virtual bool isLegalICmpImmediate(int64_t Imm) const; + + virtual bool isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const; + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/CellSPU/SPUInstrBuilder.h b/contrib/llvm/lib/Target/CellSPU/SPUInstrBuilder.h new file mode 100644 index 000000000000..b495537fc2c8 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUInstrBuilder.h @@ -0,0 +1,43 @@ +//===-- SPUInstrBuilder.h - Aides for building Cell SPU insts ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to simplify generating frame and constant pool +// references. +// +// For reference, the order of operands for memory references is: +// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate +// Displacement. +// +//===----------------------------------------------------------------------===// + +#ifndef SPU_INSTRBUILDER_H +#define SPU_INSTRBUILDER_H + +#include "llvm/CodeGen/MachineInstrBuilder.h" + +namespace llvm { + +/// addFrameReference - This function is used to add a reference to the base of +/// an abstract object on the stack frame of the current function. This +/// reference has base register as the FrameIndex offset until it is resolved. +/// This allows a constant offset to be specified as well... +/// +inline const MachineInstrBuilder& +addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0, + bool mem = true) { + if (mem) + return MIB.addImm(Offset).addFrameIndex(FI); + else + return MIB.addFrameIndex(FI).addImm(Offset); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/CellSPU/SPUInstrFormats.td b/contrib/llvm/lib/Target/CellSPU/SPUInstrFormats.td new file mode 100644 index 000000000000..cd3f42214345 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUInstrFormats.td @@ -0,0 +1,320 @@ +//===-- SPUInstrFormats.td - Cell SPU Instruction Formats --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// Cell SPU instruction formats. Note that these are notationally similar to +// PowerPC, like "A-Form". But the sizes of operands and fields differ. + +// This was kiped from the PPC instruction formats (seemed like a good idea...) + +class SPUInstr<dag OOL, dag IOL, string asmstr, InstrItinClass itin> + : Instruction { + field bits<32> Inst; + + let Namespace = "SPU"; + let OutOperandList = OOL; + let InOperandList = IOL; + let AsmString = asmstr; + let Itinerary = itin; +} + +// RR Format +class RRForm<bits<11> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : SPUInstr<OOL, IOL, asmstr, itin> { + bits<7> RA; + bits<7> RB; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-10} = opcode; + let Inst{11-17} = RB; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +let RB = 0 in { + // RR Format, where RB is zeroed (dont care): + class RRForm_1<bits<11> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : RRForm<opcode, OOL, IOL, asmstr, itin, pattern> + { } + + let RA = 0 in { + // RR Format, where RA and RB are zeroed (dont care): + // Used for reads from status control registers (see FPSCRRr32) + class RRForm_2<bits<11> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : RRForm<opcode, OOL, IOL, asmstr, itin, pattern> + { } + } +} + +let RT = 0 in { + // RR Format, where RT is zeroed (don't care), or as the instruction handbook + // says, "RT is a false target." Used in "Halt if" instructions + class RRForm_3<bits<11> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : RRForm<opcode, OOL, IOL, asmstr, itin, pattern> + { } +} + +// RRR Format +class RRRForm<bits<4> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : SPUInstr<OOL, IOL, asmstr, itin> +{ + bits<7> RA; + bits<7> RB; + bits<7> RC; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-3} = opcode; + let Inst{4-10} = RT; + let Inst{11-17} = RB; + let Inst{18-24} = RA; + let Inst{25-31} = RC; +} + +// RI7 Format +class RI7Form<bits<11> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : SPUInstr<OOL, IOL, asmstr, itin> +{ + bits<7> i7; + bits<7> RA; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-10} = opcode; + let Inst{11-17} = i7; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +// CVTIntFp Format +class CVTIntFPForm<bits<10> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : SPUInstr<OOL, IOL, asmstr, itin> +{ + bits<7> RA; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-9} = opcode; + let Inst{10-17} = 0; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +let RA = 0 in { + class BICondForm<bits<11> opcode, dag OOL, dag IOL, string asmstr, list<dag> pattern> + : RRForm<opcode, OOL, IOL, asmstr, BranchResolv, pattern> + { } + + let RT = 0 in { + // Branch instruction format (without D/E flag settings) + class BRForm<bits<11> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : RRForm<opcode, OOL, IOL, asmstr, itin, pattern> + { } + + class BIForm<bits<11> opcode, string asmstr, list<dag> pattern> + : RRForm<opcode, (outs), (ins R32C:$func), asmstr, BranchResolv, + pattern> + { } + + let RB = 0 in { + // Return instruction (bi, branch indirect), RA 
is zero (LR): + class RETForm<string asmstr, list<dag> pattern> + : BRForm<0b00010101100, (outs), (ins), asmstr, BranchResolv, + pattern> + { } + } + } +} + +// Branch indirect external data forms: +class BISLEDForm<bits<2> DE_flag, string asmstr, list<dag> pattern> + : SPUInstr<(outs), (ins indcalltarget:$func), asmstr, BranchResolv> +{ + bits<7> Rcalldest; + + let Pattern = pattern; + + let Inst{0-10} = 0b11010101100; + let Inst{11} = 0; + let Inst{12-13} = DE_flag; + let Inst{14-17} = 0b0000; + let Inst{18-24} = Rcalldest; + let Inst{25-31} = 0b0000000; +} + +// RI10 Format +class RI10Form<bits<8> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : SPUInstr<OOL, IOL, asmstr, itin> +{ + bits<10> i10; + bits<7> RA; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-7} = opcode; + let Inst{8-17} = i10; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + +// RI10 Format, where the constant is zero (or effectively ignored by the +// SPU) +let i10 = 0 in { + class RI10Form_1<bits<8> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : RI10Form<opcode, OOL, IOL, asmstr, itin, pattern> + { } +} + +// RI10 Format, where RT is ignored. +// This format is used primarily by the Halt If ... Immediate set of +// instructions +let RT = 0 in { + class RI10Form_2<bits<8> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : RI10Form<opcode, OOL, IOL, asmstr, itin, pattern> + { } +} + +// RI16 Format +class RI16Form<bits<9> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : SPUInstr<OOL, IOL, asmstr, itin> +{ + bits<16> i16; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-8} = opcode; + let Inst{9-24} = i16; + let Inst{25-31} = RT; +} + +// Specialized version of the RI16 Format for unconditional branch relative and +// branch absolute, branch and set link. Note that for branch and set link, the +// link register doesn't have to be $lr, but this is actually hard coded into +// the instruction pattern. 
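As an aside, the RI16 layout just defined can be summarized with a minimal C++ sketch (illustrative only, not part of the backend; encodeRI16 is a hypothetical helper), assuming bit 0 is the most-significant bit of the 32-bit word as the Inst{0-8}/Inst{9-24}/Inst{25-31} ranges imply:

#include <cstdint>

// Packs the RI16 fields exactly as the Inst{...} assignments above lay them
// out: 9-bit opcode in the most-significant bits, then the 16-bit immediate,
// then the 7-bit RT field in the least-significant bits.
static uint32_t encodeRI16(uint32_t Opcode9, uint32_t Imm16, uint32_t RT7) {
  uint32_t Inst = 0;
  Inst |= (Opcode9 & 0x1FFu) << 23;  // Inst{0-8}
  Inst |= (Imm16 & 0xFFFFu) << 7;    // Inst{9-24}
  Inst |= (RT7 & 0x7Fu);             // Inst{25-31}
  return Inst;
}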
+ +let RT = 0 in { + class UncondBranch<bits<9> opcode, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : RI16Form<opcode, OOL, IOL, asmstr, BranchResolv, pattern> + { } + + class BranchSetLink<bits<9> opcode, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : RI16Form<opcode, OOL, IOL, asmstr, BranchResolv, pattern> + { } +} + +//===----------------------------------------------------------------------===// +// Specialized versions of RI16: +//===----------------------------------------------------------------------===// + +// RI18 Format +class RI18Form<bits<7> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : SPUInstr<OOL, IOL, asmstr, itin> +{ + bits<18> i18; + bits<7> RT; + + let Pattern = pattern; + + let Inst{0-6} = opcode; + let Inst{7-24} = i18; + let Inst{25-31} = RT; +} + +//===----------------------------------------------------------------------===// +// Instruction formats for intrinsics: +//===----------------------------------------------------------------------===// + +// RI10 Format for v8i16 intrinsics +class RI10_Int_v8i16<bits<8> opcode, string opc, InstrItinClass itin, + Intrinsic IntID> : + RI10Form<opcode, (outs VECREG:$rT), (ins s10imm:$val, VECREG:$rA), + !strconcat(opc, " $rT, $rA, $val"), itin, + [(set (v8i16 VECREG:$rT), (IntID (v8i16 VECREG:$rA), + i16ImmSExt10:$val))] >; + +class RI10_Int_v4i32<bits<8> opcode, string opc, InstrItinClass itin, + Intrinsic IntID> : + RI10Form<opcode, (outs VECREG:$rT), (ins s10imm:$val, VECREG:$rA), + !strconcat(opc, " $rT, $rA, $val"), itin, + [(set (v4i32 VECREG:$rT), (IntID (v4i32 VECREG:$rA), + i32ImmSExt10:$val))] >; + +// RR Format for v8i16 intrinsics +class RR_Int_v8i16<bits<11> opcode, string opc, InstrItinClass itin, + Intrinsic IntID> : + RRForm<opcode, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + !strconcat(opc, " $rT, $rA, $rB"), itin, + [(set (v8i16 VECREG:$rT), (IntID (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))] >; + +// RR Format for v4i32 intrinsics +class RR_Int_v4i32<bits<11> opcode, string opc, InstrItinClass itin, + Intrinsic IntID> : + RRForm<opcode, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + !strconcat(opc, " $rT, $rA, $rB"), itin, + [(set (v4i32 VECREG:$rT), (IntID (v4i32 VECREG:$rA), + (v4i32 VECREG:$rB)))] >; + +//===----------------------------------------------------------------------===// +// Pseudo instructions, like call frames: +//===----------------------------------------------------------------------===// + +class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern> + : SPUInstr<OOL, IOL, asmstr, NoItinerary> { + let OutOperandList = OOL; + let InOperandList = IOL; + let AsmString = asmstr; + let Pattern = pattern; + let Inst{31-0} = 0; +} + +//===----------------------------------------------------------------------===// +// Branch hint formats +//===----------------------------------------------------------------------===// +// For hbrr and hbra +class HBI16Form<bits<7> opcode, dag IOL, string asmstr> + : Instruction { + field bits<32> Inst; + bits<16>i16; + bits<9>RO; + + let Namespace = "SPU"; + let InOperandList = IOL; + let OutOperandList = (outs); //no output + let AsmString = asmstr; + let Itinerary = BranchHints; + + let Inst{0-6} = opcode; + let Inst{7-8} = RO{8-7}; + let Inst{9-24} = i16; + let Inst{25-31} = RO{6-0}; +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.cpp b/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.cpp new file mode 100644 index 000000000000..759923d7bb42 --- /dev/null +++ 
b/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -0,0 +1,453 @@ +//===-- SPUInstrInfo.cpp - Cell SPU Instruction Information ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Cell SPU implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "SPUInstrInfo.h" +#include "SPUInstrBuilder.h" +#include "SPUTargetMachine.h" +#include "SPUHazardRecognizers.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +#define GET_INSTRINFO_CTOR +#include "SPUGenInstrInfo.inc" + +using namespace llvm; + +namespace { + //! Predicate for an unconditional branch instruction + inline bool isUncondBranch(const MachineInstr *I) { + unsigned opc = I->getOpcode(); + + return (opc == SPU::BR + || opc == SPU::BRA + || opc == SPU::BI); + } + + //! Predicate for a conditional branch instruction + inline bool isCondBranch(const MachineInstr *I) { + unsigned opc = I->getOpcode(); + + return (opc == SPU::BRNZr32 + || opc == SPU::BRNZv4i32 + || opc == SPU::BRZr32 + || opc == SPU::BRZv4i32 + || opc == SPU::BRHNZr16 + || opc == SPU::BRHNZv8i16 + || opc == SPU::BRHZr16 + || opc == SPU::BRHZv8i16); + } +} + +SPUInstrInfo::SPUInstrInfo(SPUTargetMachine &tm) + : SPUGenInstrInfo(SPU::ADJCALLSTACKDOWN, SPU::ADJCALLSTACKUP), + TM(tm), + RI(*TM.getSubtargetImpl(), *this) +{ /* NOP */ } + +/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for +/// this target when scheduling the DAG. 
+ScheduleHazardRecognizer *SPUInstrInfo::CreateTargetHazardRecognizer( + const TargetMachine *TM, + const ScheduleDAG *DAG) const { + const TargetInstrInfo *TII = TM->getInstrInfo(); + assert(TII && "No InstrInfo?"); + return new SPUHazardRecognizer(*TII); +} + +unsigned +SPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case SPU::LQDv16i8: + case SPU::LQDv8i16: + case SPU::LQDv4i32: + case SPU::LQDv4f32: + case SPU::LQDv2f64: + case SPU::LQDr128: + case SPU::LQDr64: + case SPU::LQDr32: + case SPU::LQDr16: { + const MachineOperand MOp1 = MI->getOperand(1); + const MachineOperand MOp2 = MI->getOperand(2); + if (MOp1.isImm() && MOp2.isFI()) { + FrameIndex = MOp2.getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + } + return 0; +} + +unsigned +SPUInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case SPU::STQDv16i8: + case SPU::STQDv8i16: + case SPU::STQDv4i32: + case SPU::STQDv4f32: + case SPU::STQDv2f64: + case SPU::STQDr128: + case SPU::STQDr64: + case SPU::STQDr32: + case SPU::STQDr16: + case SPU::STQDr8: { + const MachineOperand MOp1 = MI->getOperand(1); + const MachineOperand MOp2 = MI->getOperand(2); + if (MOp1.isImm() && MOp2.isFI()) { + FrameIndex = MOp2.getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + } + return 0; +} + +void SPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const +{ + // We support cross register class moves for our aliases, such as R3 in any + // reg class to any other reg class containing R3. This is required because + // we instruction select bitconvert i64 -> f64 as a noop for example, so our + // types have no specific meaning. + + BuildMI(MBB, I, DL, get(SPU::LRr128), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); +} + +void +SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const +{ + unsigned opc; + bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset()); + if (RC == SPU::GPRCRegisterClass) { + opc = (isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128); + } else if (RC == SPU::R64CRegisterClass) { + opc = (isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64); + } else if (RC == SPU::R64FPRegisterClass) { + opc = (isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64); + } else if (RC == SPU::R32CRegisterClass) { + opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32); + } else if (RC == SPU::R32FPRegisterClass) { + opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32); + } else if (RC == SPU::R16CRegisterClass) { + opc = (isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16); + } else if (RC == SPU::R8CRegisterClass) { + opc = (isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8); + } else if (RC == SPU::VECREGRegisterClass) { + opc = (isValidFrameIdx) ? 
SPU::STQDv16i8 : SPU::STQXv16i8; + } else { + llvm_unreachable("Unknown regclass!"); + } + + DebugLoc DL; + if (MI != MBB.end()) DL = MI->getDebugLoc(); + addFrameReference(BuildMI(MBB, MI, DL, get(opc)) + .addReg(SrcReg, getKillRegState(isKill)), FrameIdx); +} + +void +SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const +{ + unsigned opc; + bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset()); + if (RC == SPU::GPRCRegisterClass) { + opc = (isValidFrameIdx ? SPU::LQDr128 : SPU::LQXr128); + } else if (RC == SPU::R64CRegisterClass) { + opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64); + } else if (RC == SPU::R64FPRegisterClass) { + opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64); + } else if (RC == SPU::R32CRegisterClass) { + opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32); + } else if (RC == SPU::R32FPRegisterClass) { + opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32); + } else if (RC == SPU::R16CRegisterClass) { + opc = (isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16); + } else if (RC == SPU::R8CRegisterClass) { + opc = (isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8); + } else if (RC == SPU::VECREGRegisterClass) { + opc = (isValidFrameIdx) ? SPU::LQDv16i8 : SPU::LQXv16i8; + } else { + llvm_unreachable("Unknown regclass in loadRegFromStackSlot!"); + } + + DebugLoc DL; + if (MI != MBB.end()) DL = MI->getDebugLoc(); + addFrameReference(BuildMI(MBB, MI, DL, get(opc), DestReg), FrameIdx); +} + +//! Branch analysis +/*! + \note This code was kiped from PPC. There may be more branch analysis for + CellSPU than what's currently done here. + */ +bool +SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranch(LastInst)) { + // Check for jump tables + if (!LastInst->getOperand(0).isMBB()) + return true; + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (isCondBranch(LastInst)) { + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(1).getMBB(); + DEBUG(errs() << "Pushing LastInst: "); + DEBUG(LastInst->dump()); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + return false; + } + // Otherwise, don't know what this is. + return true; + } + + // Get the instruction before it if it's a terminator. + MachineInstr *SecondLastInst = I; + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && + isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with a conditional and unconditional branch, handle it. 
+ if (isCondBranch(SecondLastInst) && isUncondBranch(LastInst)) { + TBB = SecondLastInst->getOperand(1).getMBB(); + DEBUG(errs() << "Pushing SecondLastInst: "); + DEBUG(SecondLastInst->dump()); + Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode())); + Cond.push_back(SecondLastInst->getOperand(0)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two unconditional branches, handle it. The second + // one is not executed, so remove it. + if (isUncondBranch(SecondLastInst) && isUncondBranch(LastInst)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // Otherwise, can't handle this. + return true; +} + +// search MBB for branch hint labels and branch hit ops +static void removeHBR( MachineBasicBlock &MBB) { + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I){ + if (I->getOpcode() == SPU::HBRA || + I->getOpcode() == SPU::HBR_LABEL){ + I=MBB.erase(I); + if (I == MBB.end()) + break; + } + } +} + +unsigned +SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + removeHBR(MBB); + if (I == MBB.begin()) + return 0; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } + if (!isCondBranch(I) && !isUncondBranch(I)) + return 0; + + // Remove the first branch. + DEBUG(errs() << "Removing branch: "); + DEBUG(I->dump()); + I->eraseFromParent(); + I = MBB.end(); + if (I == MBB.begin()) + return 1; + + --I; + if (!(isCondBranch(I) || isUncondBranch(I))) + return 1; + + // Remove the second branch. + DEBUG(errs() << "Removing second branch: "); + DEBUG(I->dump()); + I->eraseFromParent(); + return 2; +} + +/** Find the optimal position for a hint branch instruction in a basic block. + * This should take into account: + * -the branch hint delays + * -congestion of the memory bus + * -dual-issue scheduling (i.e. avoid insertion of nops) + * Current implementation is rather simplistic. + */ +static MachineBasicBlock::iterator findHBRPosition(MachineBasicBlock &MBB) +{ + MachineBasicBlock::iterator J = MBB.end(); + for( int i=0; i<8; i++) { + if( J == MBB.begin() ) return J; + J--; + } + return J; +} + +unsigned +SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "SPU branch conditions have two components!"); + + MachineInstrBuilder MIB; + //TODO: make a more accurate algorithm. + bool haveHBR = MBB.size()>8; + + removeHBR(MBB); + MCSymbol *branchLabel = MBB.getParent()->getContext().CreateTempSymbol(); + // Add a label just before the branch + if (haveHBR) + MIB = BuildMI(&MBB, DL, get(SPU::HBR_LABEL)).addSym(branchLabel); + + // One-way branch. 
+ if (FBB == 0) { + if (Cond.empty()) { + // Unconditional branch + MIB = BuildMI(&MBB, DL, get(SPU::BR)); + MIB.addMBB(TBB); + + DEBUG(errs() << "Inserted one-way uncond branch: "); + DEBUG((*MIB).dump()); + + // basic blocks have just one branch so it is safe to add the hint a its + if (haveHBR) { + MIB = BuildMI( MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); + MIB.addSym(branchLabel); + MIB.addMBB(TBB); + } + } else { + // Conditional branch + MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); + MIB.addReg(Cond[1].getReg()).addMBB(TBB); + + if (haveHBR) { + MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); + MIB.addSym(branchLabel); + MIB.addMBB(TBB); + } + + DEBUG(errs() << "Inserted one-way cond branch: "); + DEBUG((*MIB).dump()); + } + return 1; + } else { + MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); + MachineInstrBuilder MIB2 = BuildMI(&MBB, DL, get(SPU::BR)); + + // Two-way Conditional Branch. + MIB.addReg(Cond[1].getReg()).addMBB(TBB); + MIB2.addMBB(FBB); + + if (haveHBR) { + MIB = BuildMI( MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); + MIB.addSym(branchLabel); + MIB.addMBB(FBB); + } + + DEBUG(errs() << "Inserted conditional branch: "); + DEBUG((*MIB).dump()); + DEBUG(errs() << "part 2: "); + DEBUG((*MIB2).dump()); + return 2; + } +} + +//! Reverses a branch's condition, returning false on success. +bool +SPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const { + // Pretty brainless way of inverting the condition, but it works, considering + // there are only two conditions... + static struct { + unsigned Opc; //! The incoming opcode + unsigned RevCondOpc; //! The reversed condition opcode + } revconds[] = { + { SPU::BRNZr32, SPU::BRZr32 }, + { SPU::BRNZv4i32, SPU::BRZv4i32 }, + { SPU::BRZr32, SPU::BRNZr32 }, + { SPU::BRZv4i32, SPU::BRNZv4i32 }, + { SPU::BRHNZr16, SPU::BRHZr16 }, + { SPU::BRHNZv8i16, SPU::BRHZv8i16 }, + { SPU::BRHZr16, SPU::BRHNZr16 }, + { SPU::BRHZv8i16, SPU::BRHNZv8i16 } + }; + + unsigned Opc = unsigned(Cond[0].getImm()); + // Pretty dull mapping between the two conditions that SPU can generate: + for (int i = sizeof(revconds)/sizeof(revconds[0]) - 1; i >= 0; --i) { + if (revconds[i].Opc == Opc) { + Cond[0].setImm(revconds[i].RevCondOpc); + return false; + } + } + + return true; +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.h b/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.h new file mode 100644 index 000000000000..85e5821aefa1 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.h @@ -0,0 +1,84 @@ +//===-- SPUInstrInfo.h - Cell SPU Instruction Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the CellSPU implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef SPU_INSTRUCTIONINFO_H +#define SPU_INSTRUCTIONINFO_H + +#include "SPU.h" +#include "SPURegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "SPUGenInstrInfo.inc" + +namespace llvm { + //! 
Cell SPU instruction information class + class SPUInstrInfo : public SPUGenInstrInfo { + SPUTargetMachine &TM; + const SPURegisterInfo RI; + public: + explicit SPUInstrInfo(SPUTargetMachine &tm); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const SPURegisterInfo &getRegisterInfo() const { return RI; } + + ScheduleHazardRecognizer * + CreateTargetHazardRecognizer(const TargetMachine *TM, + const ScheduleDAG *DAG) const; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + + //! Store a register to a stack slot, based on its register class. + virtual void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + //! Load a register from a stack slot, based on its register class. + virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + //! Reverses a branch's condition, returning false on success. + virtual + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + + virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.td b/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.td new file mode 100644 index 000000000000..f76ebd75bfef --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUInstrInfo.td @@ -0,0 +1,4484 @@ +//==- SPUInstrInfo.td - Describe the Cell SPU Instructions -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Cell SPU Instructions: +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TODO Items (not urgent today, but would be nice, low priority) +// +// ANDBI, ORBI: SPU constructs a 4-byte constant for these instructions by +// concatenating the byte argument b as "bbbb". Could recognize this bit pattern +// in 16-bit and 32-bit constants and reduce instruction count. 
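The recognition that TODO suggests amounts to checking whether a 32-bit constant is one byte replicated four times. A minimal sketch under that reading; isByteSplat is a hypothetical helper, not an existing LLVM API:

#include <cstdint>

// Returns true when the 32-bit constant C is a single byte replicated four
// times ("bbbb"), the pattern ANDBI/ORBI can materialize directly.
static bool isByteSplat(uint32_t C, uint8_t &Byte) {
  Byte = static_cast<uint8_t>(C & 0xFFu);
  return C == 0x01010101u * Byte;
}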
+//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pseudo instructions: +//===----------------------------------------------------------------------===// + +let hasCtrlDep = 1, Defs = [R1], Uses = [R1] in { + def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm_i32:$amt), + "${:comment} ADJCALLSTACKDOWN", + [(callseq_start timm:$amt)]>; + def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm_i32:$amt), + "${:comment} ADJCALLSTACKUP", + [(callseq_end timm:$amt)]>; + def HBR_LABEL : Pseudo<(outs), (ins hbrtarget:$targ), + "$targ:\t${:comment}branch hint target",[ ]>; +} + +//===----------------------------------------------------------------------===// +// Loads: +// NB: The ordering is actually important, since the instruction selection +// will try each of the instructions in sequence, i.e., the D-form first with +// the 10-bit displacement, then the A-form with the 16 bit displacement, and +// finally the X-form with the register-register. +//===----------------------------------------------------------------------===// + +let canFoldAsLoad = 1 in { + class LoadDFormVec<ValueType vectype> + : RI10Form<0b00101100, (outs VECREG:$rT), (ins dformaddr:$src), + "lqd\t$rT, $src", + LoadStore, + [(set (vectype VECREG:$rT), (load dform_addr:$src))]> + { } + + class LoadDForm<RegisterClass rclass> + : RI10Form<0b00101100, (outs rclass:$rT), (ins dformaddr:$src), + "lqd\t$rT, $src", + LoadStore, + [(set rclass:$rT, (load dform_addr:$src))]> + { } + + multiclass LoadDForms + { + def v16i8: LoadDFormVec<v16i8>; + def v8i16: LoadDFormVec<v8i16>; + def v4i32: LoadDFormVec<v4i32>; + def v2i64: LoadDFormVec<v2i64>; + def v4f32: LoadDFormVec<v4f32>; + def v2f64: LoadDFormVec<v2f64>; + + def r128: LoadDForm<GPRC>; + def r64: LoadDForm<R64C>; + def r32: LoadDForm<R32C>; + def f32: LoadDForm<R32FP>; + def f64: LoadDForm<R64FP>; + def r16: LoadDForm<R16C>; + def r8: LoadDForm<R8C>; + } + + class LoadAFormVec<ValueType vectype> + : RI16Form<0b100001100, (outs VECREG:$rT), (ins addr256k:$src), + "lqa\t$rT, $src", + LoadStore, + [(set (vectype VECREG:$rT), (load aform_addr:$src))]> + { } + + class LoadAForm<RegisterClass rclass> + : RI16Form<0b100001100, (outs rclass:$rT), (ins addr256k:$src), + "lqa\t$rT, $src", + LoadStore, + [(set rclass:$rT, (load aform_addr:$src))]> + { } + + multiclass LoadAForms + { + def v16i8: LoadAFormVec<v16i8>; + def v8i16: LoadAFormVec<v8i16>; + def v4i32: LoadAFormVec<v4i32>; + def v2i64: LoadAFormVec<v2i64>; + def v4f32: LoadAFormVec<v4f32>; + def v2f64: LoadAFormVec<v2f64>; + + def r128: LoadAForm<GPRC>; + def r64: LoadAForm<R64C>; + def r32: LoadAForm<R32C>; + def f32: LoadAForm<R32FP>; + def f64: LoadAForm<R64FP>; + def r16: LoadAForm<R16C>; + def r8: LoadAForm<R8C>; + } + + class LoadXFormVec<ValueType vectype> + : RRForm<0b00100011100, (outs VECREG:$rT), (ins memrr:$src), + "lqx\t$rT, $src", + LoadStore, + [(set (vectype VECREG:$rT), (load xform_addr:$src))]> + { } + + class LoadXForm<RegisterClass rclass> + : RRForm<0b00100011100, (outs rclass:$rT), (ins memrr:$src), + "lqx\t$rT, $src", + LoadStore, + [(set rclass:$rT, (load xform_addr:$src))]> + { } + + multiclass LoadXForms + { + def v16i8: LoadXFormVec<v16i8>; + def v8i16: LoadXFormVec<v8i16>; + def v4i32: LoadXFormVec<v4i32>; + def v2i64: LoadXFormVec<v2i64>; + def v4f32: LoadXFormVec<v4f32>; + def v2f64: LoadXFormVec<v2f64>; + + def r128: LoadXForm<GPRC>; + def r64: LoadXForm<R64C>; + def r32: LoadXForm<R32C>; + 
def f32: LoadXForm<R32FP>; + def f64: LoadXForm<R64FP>; + def r16: LoadXForm<R16C>; + def r8: LoadXForm<R8C>; + } + + defm LQA : LoadAForms; + defm LQD : LoadDForms; + defm LQX : LoadXForms; + +/* Load quadword, PC relative: Not much use at this point in time. + Might be of use later for relocatable code. It's effectively the + same as LQA, but uses PC-relative addressing. + def LQR : RI16Form<0b111001100, (outs VECREG:$rT), (ins s16imm:$disp), + "lqr\t$rT, $disp", LoadStore, + [(set VECREG:$rT, (load iaddr:$disp))]>; + */ +} + +//===----------------------------------------------------------------------===// +// Stores: +//===----------------------------------------------------------------------===// +class StoreDFormVec<ValueType vectype> + : RI10Form<0b00100100, (outs), (ins VECREG:$rT, dformaddr:$src), + "stqd\t$rT, $src", + LoadStore, + [(store (vectype VECREG:$rT), dform_addr:$src)]> +{ } + +class StoreDForm<RegisterClass rclass> + : RI10Form<0b00100100, (outs), (ins rclass:$rT, dformaddr:$src), + "stqd\t$rT, $src", + LoadStore, + [(store rclass:$rT, dform_addr:$src)]> +{ } + +multiclass StoreDForms +{ + def v16i8: StoreDFormVec<v16i8>; + def v8i16: StoreDFormVec<v8i16>; + def v4i32: StoreDFormVec<v4i32>; + def v2i64: StoreDFormVec<v2i64>; + def v4f32: StoreDFormVec<v4f32>; + def v2f64: StoreDFormVec<v2f64>; + + def r128: StoreDForm<GPRC>; + def r64: StoreDForm<R64C>; + def r32: StoreDForm<R32C>; + def f32: StoreDForm<R32FP>; + def f64: StoreDForm<R64FP>; + def r16: StoreDForm<R16C>; + def r8: StoreDForm<R8C>; +} + +class StoreAFormVec<ValueType vectype> + : RI16Form<0b0010010, (outs), (ins VECREG:$rT, addr256k:$src), + "stqa\t$rT, $src", + LoadStore, + [(store (vectype VECREG:$rT), aform_addr:$src)]>; + +class StoreAForm<RegisterClass rclass> + : RI16Form<0b001001, (outs), (ins rclass:$rT, addr256k:$src), + "stqa\t$rT, $src", + LoadStore, + [(store rclass:$rT, aform_addr:$src)]>; + +multiclass StoreAForms +{ + def v16i8: StoreAFormVec<v16i8>; + def v8i16: StoreAFormVec<v8i16>; + def v4i32: StoreAFormVec<v4i32>; + def v2i64: StoreAFormVec<v2i64>; + def v4f32: StoreAFormVec<v4f32>; + def v2f64: StoreAFormVec<v2f64>; + + def r128: StoreAForm<GPRC>; + def r64: StoreAForm<R64C>; + def r32: StoreAForm<R32C>; + def f32: StoreAForm<R32FP>; + def f64: StoreAForm<R64FP>; + def r16: StoreAForm<R16C>; + def r8: StoreAForm<R8C>; +} + +class StoreXFormVec<ValueType vectype> + : RRForm<0b00100100, (outs), (ins VECREG:$rT, memrr:$src), + "stqx\t$rT, $src", + LoadStore, + [(store (vectype VECREG:$rT), xform_addr:$src)]> +{ } + +class StoreXForm<RegisterClass rclass> + : RRForm<0b00100100, (outs), (ins rclass:$rT, memrr:$src), + "stqx\t$rT, $src", + LoadStore, + [(store rclass:$rT, xform_addr:$src)]> +{ } + +multiclass StoreXForms +{ + def v16i8: StoreXFormVec<v16i8>; + def v8i16: StoreXFormVec<v8i16>; + def v4i32: StoreXFormVec<v4i32>; + def v2i64: StoreXFormVec<v2i64>; + def v4f32: StoreXFormVec<v4f32>; + def v2f64: StoreXFormVec<v2f64>; + + def r128: StoreXForm<GPRC>; + def r64: StoreXForm<R64C>; + def r32: StoreXForm<R32C>; + def f32: StoreXForm<R32FP>; + def f64: StoreXForm<R64FP>; + def r16: StoreXForm<R16C>; + def r8: StoreXForm<R8C>; +} + +defm STQD : StoreDForms; +defm STQA : StoreAForms; +defm STQX : StoreXForms; + +/* Store quadword, PC relative: Not much use at this point in time. Might + be useful for relocatable code. 
+def STQR : RI16Form<0b111000100, (outs), (ins VECREG:$rT, s16imm:$disp), + "stqr\t$rT, $disp", LoadStore, + [(store VECREG:$rT, iaddr:$disp)]>; +*/ + +//===----------------------------------------------------------------------===// +// Generate Controls for Insertion: +//===----------------------------------------------------------------------===// + +def CBD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins shufaddr:$src), + "cbd\t$rT, $src", ShuffleOp, + [(set (v16i8 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; + +def CBX: RRForm<0b00101011100, (outs VECREG:$rT), (ins memrr:$src), + "cbx\t$rT, $src", ShuffleOp, + [(set (v16i8 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; + +def CHD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins shufaddr:$src), + "chd\t$rT, $src", ShuffleOp, + [(set (v8i16 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; + +def CHX: RRForm<0b10101011100, (outs VECREG:$rT), (ins memrr:$src), + "chx\t$rT, $src", ShuffleOp, + [(set (v8i16 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; + +def CWD: RI7Form<0b01101111100, (outs VECREG:$rT), (ins shufaddr:$src), + "cwd\t$rT, $src", ShuffleOp, + [(set (v4i32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; + +def CWX: RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src), + "cwx\t$rT, $src", ShuffleOp, + [(set (v4i32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; + +def CWDf32: RI7Form<0b01101111100, (outs VECREG:$rT), (ins shufaddr:$src), + "cwd\t$rT, $src", ShuffleOp, + [(set (v4f32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; + +def CWXf32: RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src), + "cwx\t$rT, $src", ShuffleOp, + [(set (v4f32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; + +def CDD: RI7Form<0b11101111100, (outs VECREG:$rT), (ins shufaddr:$src), + "cdd\t$rT, $src", ShuffleOp, + [(set (v2i64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; + +def CDX: RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src), + "cdx\t$rT, $src", ShuffleOp, + [(set (v2i64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; + +def CDDf64: RI7Form<0b11101111100, (outs VECREG:$rT), (ins shufaddr:$src), + "cdd\t$rT, $src", ShuffleOp, + [(set (v2f64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; + +def CDXf64: RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src), + "cdx\t$rT, $src", ShuffleOp, + [(set (v2f64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; + +//===----------------------------------------------------------------------===// +// Constant formation: +//===----------------------------------------------------------------------===// + +def ILHv8i16: + RI16Form<0b110000010, (outs VECREG:$rT), (ins s16imm:$val), + "ilh\t$rT, $val", ImmLoad, + [(set (v8i16 VECREG:$rT), (v8i16 v8i16SExt16Imm:$val))]>; + +def ILHr16: + RI16Form<0b110000010, (outs R16C:$rT), (ins s16imm:$val), + "ilh\t$rT, $val", ImmLoad, + [(set R16C:$rT, immSExt16:$val)]>; + +// Cell SPU doesn't have a native 8-bit immediate load, but ILH works ("with +// the right constant") +def ILHr8: + RI16Form<0b110000010, (outs R8C:$rT), (ins s16imm_i8:$val), + "ilh\t$rT, $val", ImmLoad, + [(set R8C:$rT, immSExt8:$val)]>; + +// IL does sign extension! 
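A trivial sketch of that sign extension, shown only to make the "il" immediate behaviour concrete (signExtendIL is a hypothetical helper):

#include <cstdint>

// "il" sign-extends its 16-bit immediate into the full register width.
static int32_t signExtendIL(uint16_t Imm) {
  return static_cast<int32_t>(static_cast<int16_t>(Imm));
}
// e.g. signExtendIL(0xFFFF) == -1, signExtendIL(0x7FFF) == 32767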
+ +class ILInst<dag OOL, dag IOL, list<dag> pattern>: + RI16Form<0b100000010, OOL, IOL, "il\t$rT, $val", + ImmLoad, pattern>; + +class ILVecInst<ValueType vectype, Operand immtype, PatLeaf xform>: + ILInst<(outs VECREG:$rT), (ins immtype:$val), + [(set (vectype VECREG:$rT), (vectype xform:$val))]>; + +class ILRegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>: + ILInst<(outs rclass:$rT), (ins immtype:$val), + [(set rclass:$rT, xform:$val)]>; + +multiclass ImmediateLoad +{ + def v2i64: ILVecInst<v2i64, s16imm_i64, v2i64SExt16Imm>; + def v4i32: ILVecInst<v4i32, s16imm_i32, v4i32SExt16Imm>; + + // TODO: Need v2f64, v4f32 + + def r64: ILRegInst<R64C, s16imm_i64, immSExt16>; + def r32: ILRegInst<R32C, s16imm_i32, immSExt16>; + def f32: ILRegInst<R32FP, s16imm_f32, fpimmSExt16>; + def f64: ILRegInst<R64FP, s16imm_f64, fpimmSExt16>; +} + +defm IL : ImmediateLoad; + +class ILHUInst<dag OOL, dag IOL, list<dag> pattern>: + RI16Form<0b010000010, OOL, IOL, "ilhu\t$rT, $val", + ImmLoad, pattern>; + +class ILHUVecInst<ValueType vectype, Operand immtype, PatLeaf xform>: + ILHUInst<(outs VECREG:$rT), (ins immtype:$val), + [(set (vectype VECREG:$rT), (vectype xform:$val))]>; + +class ILHURegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>: + ILHUInst<(outs rclass:$rT), (ins immtype:$val), + [(set rclass:$rT, xform:$val)]>; + +multiclass ImmLoadHalfwordUpper +{ + def v2i64: ILHUVecInst<v2i64, u16imm_i64, immILHUvec_i64>; + def v4i32: ILHUVecInst<v4i32, u16imm_i32, immILHUvec>; + + def r64: ILHURegInst<R64C, u16imm_i64, hi16>; + def r32: ILHURegInst<R32C, u16imm_i32, hi16>; + + // Loads the high portion of an address + def hi: ILHURegInst<R32C, symbolHi, hi16>; + + // Used in custom lowering constant SFP loads: + def f32: ILHURegInst<R32FP, f16imm, hi16_f32>; +} + +defm ILHU : ImmLoadHalfwordUpper; + +// Immediate load address (can also be used to load 18-bit unsigned constants, +// see the zext 16->32 pattern) + +class ILAInst<dag OOL, dag IOL, list<dag> pattern>: + RI18Form<0b1000010, OOL, IOL, "ila\t$rT, $val", + LoadNOP, pattern>; + +class ILAVecInst<ValueType vectype, Operand immtype, PatLeaf xform>: + ILAInst<(outs VECREG:$rT), (ins immtype:$val), + [(set (vectype VECREG:$rT), (vectype xform:$val))]>; + +class ILARegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>: + ILAInst<(outs rclass:$rT), (ins immtype:$val), + [(set rclass:$rT, xform:$val)]>; + +multiclass ImmLoadAddress +{ + def v2i64: ILAVecInst<v2i64, u18imm, v2i64Uns18Imm>; + def v4i32: ILAVecInst<v4i32, u18imm, v4i32Uns18Imm>; + + def r64: ILARegInst<R64C, u18imm_i64, imm18>; + def r32: ILARegInst<R32C, u18imm, imm18>; + def f32: ILARegInst<R32FP, f18imm, fpimm18>; + def f64: ILARegInst<R64FP, f18imm_f64, fpimm18>; + + def hi: ILARegInst<R32C, symbolHi, imm18>; + def lo: ILARegInst<R32C, symbolLo, imm18>; + + def lsa: ILAInst<(outs R32C:$rT), (ins symbolLSA:$val), + [(set R32C:$rT, imm18:$val)]>; +} + +defm ILA : ImmLoadAddress; + +// Immediate OR, Halfword Lower: The "other" part of loading large constants +// into 32-bit registers. See the anonymous pattern Pat<(i32 imm:$imm), ...> +// Note that these are really two operand instructions, but they're encoded +// as three operands with the first two arguments tied-to each other. 
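A minimal sketch of the split this implies for a full 32-bit constant, assuming the ilhu-then-iohl sequence the comments above describe (splitConstant and ILHU_IOHL are hypothetical names used only for illustration):

#include <cstdint>

struct ILHU_IOHL { uint16_t Hi, Lo; };

// ILHU materializes the upper halfword, IOHL then ORs in the lower halfword
// of the same 32-bit constant.
static ILHU_IOHL splitConstant(uint32_t C) {
  return { static_cast<uint16_t>(C >> 16), static_cast<uint16_t>(C & 0xFFFFu) };
}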
+ +class IOHLInst<dag OOL, dag IOL, list<dag> pattern>: + RI16Form<0b100000110, OOL, IOL, "iohl\t$rT, $val", + ImmLoad, pattern>, + RegConstraint<"$rS = $rT">, + NoEncode<"$rS">; + +class IOHLVecInst<ValueType vectype, Operand immtype /* , PatLeaf xform */>: + IOHLInst<(outs VECREG:$rT), (ins VECREG:$rS, immtype:$val), + [/* no pattern */]>; + +class IOHLRegInst<RegisterClass rclass, Operand immtype /* , PatLeaf xform */>: + IOHLInst<(outs rclass:$rT), (ins rclass:$rS, immtype:$val), + [/* no pattern */]>; + +multiclass ImmOrHalfwordLower +{ + def v2i64: IOHLVecInst<v2i64, u16imm_i64>; + def v4i32: IOHLVecInst<v4i32, u16imm_i32>; + + def r32: IOHLRegInst<R32C, i32imm>; + def f32: IOHLRegInst<R32FP, f32imm>; + + def lo: IOHLRegInst<R32C, symbolLo>; +} + +defm IOHL: ImmOrHalfwordLower; + +// Form select mask for bytes using immediate, used in conjunction with the +// SELB instruction: + +class FSMBIVec<ValueType vectype>: + RI16Form<0b101001100, (outs VECREG:$rT), (ins u16imm:$val), + "fsmbi\t$rT, $val", + SelectOp, + [(set (vectype VECREG:$rT), (SPUselmask (i16 immU16:$val)))]>; + +multiclass FormSelectMaskBytesImm +{ + def v16i8: FSMBIVec<v16i8>; + def v8i16: FSMBIVec<v8i16>; + def v4i32: FSMBIVec<v4i32>; + def v2i64: FSMBIVec<v2i64>; +} + +defm FSMBI : FormSelectMaskBytesImm; + +// fsmb: Form select mask for bytes. N.B. Input operand, $rA, is 16-bits +class FSMBInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b01101101100, OOL, IOL, "fsmb\t$rT, $rA", SelectOp, + pattern>; + +class FSMBRegInst<RegisterClass rclass, ValueType vectype>: + FSMBInst<(outs VECREG:$rT), (ins rclass:$rA), + [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>; + +class FSMBVecInst<ValueType vectype>: + FSMBInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), + (SPUselmask (vectype VECREG:$rA)))]>; + +multiclass FormSelectMaskBits { + def v16i8_r16: FSMBRegInst<R16C, v16i8>; + def v16i8: FSMBVecInst<v16i8>; +} + +defm FSMB: FormSelectMaskBits; + +// fsmh: Form select mask for halfwords. N.B., Input operand, $rA, is +// only 8-bits wide (even though it's input as 16-bits here) + +class FSMHInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b10101101100, OOL, IOL, "fsmh\t$rT, $rA", SelectOp, + pattern>; + +class FSMHRegInst<RegisterClass rclass, ValueType vectype>: + FSMHInst<(outs VECREG:$rT), (ins rclass:$rA), + [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>; + +class FSMHVecInst<ValueType vectype>: + FSMHInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), + (SPUselmask (vectype VECREG:$rA)))]>; + +multiclass FormSelectMaskHalfword { + def v8i16_r16: FSMHRegInst<R16C, v8i16>; + def v8i16: FSMHVecInst<v8i16>; +} + +defm FSMH: FormSelectMaskHalfword; + +// fsm: Form select mask for words. Like the other fsm* instructions, +// only the lower 4 bits of $rA are significant. 
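A sketch of the expansion these fsm comments describe; the bit-to-word ordering is an assumption made for illustration and formSelectMaskWord is a hypothetical helper:

#include <array>
#include <cstdint>

// Expands the low four bits of RA into a per-word select mask; an all-ones
// word selects one input of SELB, an all-zeros word selects the other.
// Mapping bit 3 to word 0 is an assumption made for this illustration.
static std::array<uint32_t, 4> formSelectMaskWord(uint32_t RA) {
  std::array<uint32_t, 4> Mask;
  for (int i = 0; i < 4; ++i)
    Mask[i] = ((RA >> (3 - i)) & 1u) ? 0xFFFFFFFFu : 0u;
  return Mask;
}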
+ +class FSMInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b00101101100, OOL, IOL, "fsm\t$rT, $rA", SelectOp, + pattern>; + +class FSMRegInst<ValueType vectype, RegisterClass rclass>: + FSMInst<(outs VECREG:$rT), (ins rclass:$rA), + [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>; + +class FSMVecInst<ValueType vectype>: + FSMInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), (SPUselmask (vectype VECREG:$rA)))]>; + +multiclass FormSelectMaskWord { + def v4i32: FSMVecInst<v4i32>; + + def r32 : FSMRegInst<v4i32, R32C>; + def r16 : FSMRegInst<v4i32, R16C>; +} + +defm FSM : FormSelectMaskWord; + +// Special case when used for i64 math operations +multiclass FormSelectMaskWord64 { + def r32 : FSMRegInst<v2i64, R32C>; + def r16 : FSMRegInst<v2i64, R16C>; +} + +defm FSM64 : FormSelectMaskWord64; + +//===----------------------------------------------------------------------===// +// Integer and Logical Operations: +//===----------------------------------------------------------------------===// + +def AHv8i16: + RRForm<0b00010011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "ah\t$rT, $rA, $rB", IntegerOp, + [(set (v8i16 VECREG:$rT), (int_spu_si_ah VECREG:$rA, VECREG:$rB))]>; + +def : Pat<(add (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)), + (AHv8i16 VECREG:$rA, VECREG:$rB)>; + +def AHr16: + RRForm<0b00010011000, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB), + "ah\t$rT, $rA, $rB", IntegerOp, + [(set R16C:$rT, (add R16C:$rA, R16C:$rB))]>; + +def AHIvec: + RI10Form<0b10111000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "ahi\t$rT, $rA, $val", IntegerOp, + [(set (v8i16 VECREG:$rT), (add (v8i16 VECREG:$rA), + v8i16SExt10Imm:$val))]>; + +def AHIr16: + RI10Form<0b10111000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val), + "ahi\t$rT, $rA, $val", IntegerOp, + [(set R16C:$rT, (add R16C:$rA, i16ImmSExt10:$val))]>; + +// v4i32, i32 add instruction: + +class AInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b00000011000, OOL, IOL, + "a\t$rT, $rA, $rB", IntegerOp, + pattern>; + +class AVecInst<ValueType vectype>: + AInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), (add (vectype VECREG:$rA), + (vectype VECREG:$rB)))]>; + +class ARegInst<RegisterClass rclass>: + AInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (add rclass:$rA, rclass:$rB))]>; + +multiclass AddInstruction { + def v4i32: AVecInst<v4i32>; + def v16i8: AVecInst<v16i8>; + def r32: ARegInst<R32C>; +} + +defm A : AddInstruction; + +class AIInst<dag OOL, dag IOL, list<dag> pattern>: + RI10Form<0b00111000, OOL, IOL, + "ai\t$rT, $rA, $val", IntegerOp, + pattern>; + +class AIVecInst<ValueType vectype, PatLeaf immpred>: + AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (vectype VECREG:$rT), (add (vectype VECREG:$rA), immpred:$val))]>; + +class AIFPVecInst<ValueType vectype, PatLeaf immpred>: + AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [/* no pattern */]>; + +class AIRegInst<RegisterClass rclass, PatLeaf immpred>: + AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val), + [(set rclass:$rT, (add rclass:$rA, immpred:$val))]>; + +// This is used to add epsilons to floating point numbers in the f32 fdiv code: +class AIFPInst<RegisterClass rclass, PatLeaf immpred>: + AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val), + [/* no pattern */]>; + +multiclass AddImmediate { + def v4i32: AIVecInst<v4i32, v4i32SExt10Imm>; + + def r32: AIRegInst<R32C, i32ImmSExt10>; + + def v4f32: AIFPVecInst<v4f32, v4i32SExt10Imm>; + 
def f32: AIFPInst<R32FP, i32ImmSExt10>; +} + +defm AI : AddImmediate; + +def SFHvec: + RRForm<0b00010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "sfh\t$rT, $rA, $rB", IntegerOp, + [(set (v8i16 VECREG:$rT), (sub (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + +def SFHr16: + RRForm<0b00010010000, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB), + "sfh\t$rT, $rA, $rB", IntegerOp, + [(set R16C:$rT, (sub R16C:$rB, R16C:$rA))]>; + +def SFHIvec: + RI10Form<0b10110000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "sfhi\t$rT, $rA, $val", IntegerOp, + [(set (v8i16 VECREG:$rT), (sub v8i16SExt10Imm:$val, + (v8i16 VECREG:$rA)))]>; + +def SFHIr16 : RI10Form<0b10110000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val), + "sfhi\t$rT, $rA, $val", IntegerOp, + [(set R16C:$rT, (sub i16ImmSExt10:$val, R16C:$rA))]>; + +def SFvec : RRForm<0b00000010000, (outs VECREG:$rT), + (ins VECREG:$rA, VECREG:$rB), + "sf\t$rT, $rA, $rB", IntegerOp, + [(set (v4i32 VECREG:$rT), (sub (v4i32 VECREG:$rB), (v4i32 VECREG:$rA)))]>; + + +def SFr32 : RRForm<0b00000010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), + "sf\t$rT, $rA, $rB", IntegerOp, + [(set R32C:$rT, (sub R32C:$rB, R32C:$rA))]>; + +def SFIvec: + RI10Form<0b00110000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + "sfi\t$rT, $rA, $val", IntegerOp, + [(set (v4i32 VECREG:$rT), (sub v4i32SExt10Imm:$val, + (v4i32 VECREG:$rA)))]>; + +def SFIr32 : RI10Form<0b00110000, (outs R32C:$rT), + (ins R32C:$rA, s10imm_i32:$val), + "sfi\t$rT, $rA, $val", IntegerOp, + [(set R32C:$rT, (sub i32ImmSExt10:$val, R32C:$rA))]>; + +// ADDX: only available in vector form, doesn't match a pattern. +class ADDXInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b00000010110, OOL, IOL, + "addx\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class ADDXVecInst<ValueType vectype>: + ADDXInst<(outs VECREG:$rT), + (ins VECREG:$rA, VECREG:$rB, VECREG:$rCarry), + [/* no pattern */]>, + RegConstraint<"$rCarry = $rT">, + NoEncode<"$rCarry">; + +class ADDXRegInst<RegisterClass rclass>: + ADDXInst<(outs rclass:$rT), + (ins rclass:$rA, rclass:$rB, rclass:$rCarry), + [/* no pattern */]>, + RegConstraint<"$rCarry = $rT">, + NoEncode<"$rCarry">; + +multiclass AddExtended { + def v2i64 : ADDXVecInst<v2i64>; + def v4i32 : ADDXVecInst<v4i32>; + def r64 : ADDXRegInst<R64C>; + def r32 : ADDXRegInst<R32C>; +} + +defm ADDX : AddExtended; + +// CG: Generate carry for add +class CGInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b01000011000, OOL, IOL, + "cg\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class CGVecInst<ValueType vectype>: + CGInst<(outs VECREG:$rT), + (ins VECREG:$rA, VECREG:$rB), + [/* no pattern */]>; + +class CGRegInst<RegisterClass rclass>: + CGInst<(outs rclass:$rT), + (ins rclass:$rA, rclass:$rB), + [/* no pattern */]>; + +multiclass CarryGenerate { + def v2i64 : CGVecInst<v2i64>; + def v4i32 : CGVecInst<v4i32>; + def r64 : CGRegInst<R64C>; + def r32 : CGRegInst<R32C>; +} + +defm CG : CarryGenerate; + +// SFX: Subract from, extended. 
This is used in conjunction with BG to subtract +// with carry (borrow, in this case) +class SFXInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10000010110, OOL, IOL, + "sfx\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class SFXVecInst<ValueType vectype>: + SFXInst<(outs VECREG:$rT), + (ins VECREG:$rA, VECREG:$rB, VECREG:$rCarry), + [/* no pattern */]>, + RegConstraint<"$rCarry = $rT">, + NoEncode<"$rCarry">; + +class SFXRegInst<RegisterClass rclass>: + SFXInst<(outs rclass:$rT), + (ins rclass:$rA, rclass:$rB, rclass:$rCarry), + [/* no pattern */]>, + RegConstraint<"$rCarry = $rT">, + NoEncode<"$rCarry">; + +multiclass SubtractExtended { + def v2i64 : SFXVecInst<v2i64>; + def v4i32 : SFXVecInst<v4i32>; + def r64 : SFXRegInst<R64C>; + def r32 : SFXRegInst<R32C>; +} + +defm SFX : SubtractExtended; + +// BG: only available in vector form, doesn't match a pattern. +class BGInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b01000010000, OOL, IOL, + "bg\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class BGVecInst<ValueType vectype>: + BGInst<(outs VECREG:$rT), + (ins VECREG:$rA, VECREG:$rB), + [/* no pattern */]>; + +class BGRegInst<RegisterClass rclass>: + BGInst<(outs rclass:$rT), + (ins rclass:$rA, rclass:$rB), + [/* no pattern */]>; + +multiclass BorrowGenerate { + def v4i32 : BGVecInst<v4i32>; + def v2i64 : BGVecInst<v2i64>; + def r64 : BGRegInst<R64C>; + def r32 : BGRegInst<R32C>; +} + +defm BG : BorrowGenerate; + +// BGX: Borrow generate, extended. +def BGXvec: + RRForm<0b11000010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, + VECREG:$rCarry), + "bgx\t$rT, $rA, $rB", IntegerOp, + []>, + RegConstraint<"$rCarry = $rT">, + NoEncode<"$rCarry">; + +// Halfword multiply variants: +// N.B: These can be used to build up larger quantities (16x16 -> 32) + +def MPYv8i16: + RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "mpy\t$rT, $rA, $rB", IntegerMulDiv, + [/* no pattern */]>; + +def MPYr16: + RRForm<0b00100011110, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB), + "mpy\t$rT, $rA, $rB", IntegerMulDiv, + [(set R16C:$rT, (mul R16C:$rA, R16C:$rB))]>; + +// Unsigned 16-bit multiply: + +class MPYUInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b00110011110, OOL, IOL, + "mpyu\t$rT, $rA, $rB", IntegerMulDiv, + pattern>; + +def MPYUv4i32: + MPYUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [/* no pattern */]>; + +def MPYUr16: + MPYUInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB), + [(set R32C:$rT, (mul (zext R16C:$rA), (zext R16C:$rB)))]>; + +def MPYUr32: + MPYUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), + [/* no pattern */]>; + +// mpyi: multiply 16 x s10imm -> 32 result. + +class MPYIInst<dag OOL, dag IOL, list<dag> pattern>: + RI10Form<0b00101110, OOL, IOL, + "mpyi\t$rT, $rA, $val", IntegerMulDiv, + pattern>; + +def MPYIvec: + MPYIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v8i16 VECREG:$rT), + (mul (v8i16 VECREG:$rA), v8i16SExt10Imm:$val))]>; + +def MPYIr16: + MPYIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val), + [(set R16C:$rT, (mul R16C:$rA, i16ImmSExt10:$val))]>; + +// mpyui: same issues as other multiplies, plus, this doesn't match a +// pattern... 
but may be used during target DAG selection or lowering + +class MPYUIInst<dag OOL, dag IOL, list<dag> pattern>: + RI10Form<0b10101110, OOL, IOL, + "mpyui\t$rT, $rA, $val", IntegerMulDiv, + pattern>; + +def MPYUIvec: + MPYUIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + []>; + +def MPYUIr16: + MPYUIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val), + []>; + +// mpya: 16 x 16 + 16 -> 32 bit result +class MPYAInst<dag OOL, dag IOL, list<dag> pattern>: + RRRForm<0b0011, OOL, IOL, + "mpya\t$rT, $rA, $rB, $rC", IntegerMulDiv, + pattern>; + +def MPYAv4i32: + MPYAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + [(set (v4i32 VECREG:$rT), + (add (v4i32 (bitconvert (mul (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))), + (v4i32 VECREG:$rC)))]>; + +def MPYAr32: + MPYAInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB, R32C:$rC), + [(set R32C:$rT, (add (sext (mul R16C:$rA, R16C:$rB)), + R32C:$rC))]>; + +def MPYAr32_sext: + MPYAInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB, R32C:$rC), + [(set R32C:$rT, (add (mul (sext R16C:$rA), (sext R16C:$rB)), + R32C:$rC))]>; + +def MPYAr32_sextinreg: + MPYAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB, R32C:$rC), + [(set R32C:$rT, (add (mul (sext_inreg R32C:$rA, i16), + (sext_inreg R32C:$rB, i16)), + R32C:$rC))]>; + +// mpyh: multiply high, used to synthesize 32-bit multiplies +class MPYHInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10100011110, OOL, IOL, + "mpyh\t$rT, $rA, $rB", IntegerMulDiv, + pattern>; + +def MPYHv4i32: + MPYHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [/* no pattern */]>; + +def MPYHr32: + MPYHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), + [/* no pattern */]>; + +// mpys: multiply high and shift right (returns the top half of +// a 16-bit multiply, sign extended to 32 bits.) + +class MPYSInst<dag OOL, dag IOL>: + RRForm<0b11100011110, OOL, IOL, + "mpys\t$rT, $rA, $rB", IntegerMulDiv, + [/* no pattern */]>; + +def MPYSv4i32: + MPYSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + +def MPYSr16: + MPYSInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB)>; + +// mpyhh: multiply high-high (returns the 32-bit result from multiplying +// the top 16 bits of the $rA, $rB) + +class MPYHHInst<dag OOL, dag IOL>: + RRForm<0b01100011110, OOL, IOL, + "mpyhh\t$rT, $rA, $rB", IntegerMulDiv, + [/* no pattern */]>; + +def MPYHHv8i16: + MPYHHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + +def MPYHHr32: + MPYHHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>; + +// mpyhha: Multiply high-high, add to $rT: + +class MPYHHAInst<dag OOL, dag IOL>: + RRForm<0b01100010110, OOL, IOL, + "mpyhha\t$rT, $rA, $rB", IntegerMulDiv, + [/* no pattern */]>; + +def MPYHHAvec: + MPYHHAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + +def MPYHHAr32: + MPYHHAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>; + +// mpyhhu: Multiply high-high, unsigned, e.g.: +// +// +-------+-------+ +-------+-------+ +---------+ +// | a0 . a1 | x | b0 . 
b1 | = | a0 x b0 | +// +-------+-------+ +-------+-------+ +---------+ +// +// where a0, b0 are the upper 16 bits of the 32-bit word + +class MPYHHUInst<dag OOL, dag IOL>: + RRForm<0b01110011110, OOL, IOL, + "mpyhhu\t$rT, $rA, $rB", IntegerMulDiv, + [/* no pattern */]>; + +def MPYHHUv4i32: + MPYHHUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + +def MPYHHUr32: + MPYHHUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>; + +// mpyhhau: Multiply high-high, unsigned + +class MPYHHAUInst<dag OOL, dag IOL>: + RRForm<0b01110010110, OOL, IOL, + "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv, + [/* no pattern */]>; + +def MPYHHAUvec: + MPYHHAUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + +def MPYHHAUr32: + MPYHHAUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// clz: Count leading zeroes +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +class CLZInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b10100101010, OOL, IOL, "clz\t$rT, $rA", + IntegerOp, pattern>; + +class CLZRegInst<RegisterClass rclass>: + CLZInst<(outs rclass:$rT), (ins rclass:$rA), + [(set rclass:$rT, (ctlz rclass:$rA))]>; + +class CLZVecInst<ValueType vectype>: + CLZInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), (ctlz (vectype VECREG:$rA)))]>; + +multiclass CountLeadingZeroes { + def v4i32 : CLZVecInst<v4i32>; + def r32 : CLZRegInst<R32C>; +} + +defm CLZ : CountLeadingZeroes; + +// cntb: Count ones in bytes (aka "population count") +// +// NOTE: This instruction is really a vector instruction, but the custom +// lowering code uses it in unorthodox ways to support CTPOP for other +// data types! + +def CNTBv16i8: + RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA), + "cntb\t$rT, $rA", IntegerOp, + [(set (v16i8 VECREG:$rT), (SPUcntb (v16i8 VECREG:$rA)))]>; + +def CNTBv8i16 : + RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA), + "cntb\t$rT, $rA", IntegerOp, + [(set (v8i16 VECREG:$rT), (SPUcntb (v8i16 VECREG:$rA)))]>; + +def CNTBv4i32 : + RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA), + "cntb\t$rT, $rA", IntegerOp, + [(set (v4i32 VECREG:$rT), (SPUcntb (v4i32 VECREG:$rA)))]>; + +// gbb: Gather the low order bits from each byte in $rA into a single 16-bit +// quantity stored into $rT's slot 0, upper 16 bits are zeroed, as are +// slots 1-3. +// +// Note: This instruction "pairs" with the fsmb instruction for all of the +// various types defined here. +// +// Note 2: The "VecInst" and "RegInst" forms refer to the result being either +// a vector or register. + +class GBBInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b01001101100, OOL, IOL, "gbb\t$rT, $rA", GatherOp, pattern>; + +class GBBRegInst<RegisterClass rclass, ValueType vectype>: + GBBInst<(outs rclass:$rT), (ins VECREG:$rA), + [/* no pattern */]>; + +class GBBVecInst<ValueType vectype>: + GBBInst<(outs VECREG:$rT), (ins VECREG:$rA), + [/* no pattern */]>; + +multiclass GatherBitsFromBytes { + def v16i8_r32: GBBRegInst<R32C, v16i8>; + def v16i8_r16: GBBRegInst<R16C, v16i8>; + def v16i8: GBBVecInst<v16i8>; +} + +defm GBB: GatherBitsFromBytes; + +// gbh: Gather all low order bits from each halfword in $rA into a single +// 8-bit quantity stored in $rT's slot 0, with the upper bits of $rT set to 0 +// and slots 1-3 also set to 0. +// +// See notes for GBBInst, above. 
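A sketch of the gbb behaviour described above, gathering the least-significant bit of each byte into slot 0; the byte-to-bit ordering is an assumption for illustration and gatherByteLSBs is a hypothetical helper:

#include <cstdint>

// Collects the least-significant bit of each of the 16 bytes into one 16-bit
// value; byte 0 supplying the most-significant result bit is an assumption.
static uint16_t gatherByteLSBs(const uint8_t Bytes[16]) {
  uint16_t Result = 0;
  for (int i = 0; i < 16; ++i)
    Result = static_cast<uint16_t>((Result << 1) | (Bytes[i] & 1u));
  return Result;
}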
+ +class GBHInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b10001101100, OOL, IOL, "gbh\t$rT, $rA", GatherOp, + pattern>; + +class GBHRegInst<RegisterClass rclass, ValueType vectype>: + GBHInst<(outs rclass:$rT), (ins VECREG:$rA), + [/* no pattern */]>; + +class GBHVecInst<ValueType vectype>: + GBHInst<(outs VECREG:$rT), (ins VECREG:$rA), + [/* no pattern */]>; + +multiclass GatherBitsHalfword { + def v8i16_r32: GBHRegInst<R32C, v8i16>; + def v8i16_r16: GBHRegInst<R16C, v8i16>; + def v8i16: GBHVecInst<v8i16>; +} + +defm GBH: GatherBitsHalfword; + +// gb: Gather all low order bits from each word in $rA into a single +// 4-bit quantity stored in $rT's slot 0, upper bits in $rT set to 0, +// as well as slots 1-3. +// +// See notes for gbb, above. + +class GBInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b00001101100, OOL, IOL, "gb\t$rT, $rA", GatherOp, + pattern>; + +class GBRegInst<RegisterClass rclass, ValueType vectype>: + GBInst<(outs rclass:$rT), (ins VECREG:$rA), + [/* no pattern */]>; + +class GBVecInst<ValueType vectype>: + GBInst<(outs VECREG:$rT), (ins VECREG:$rA), + [/* no pattern */]>; + +multiclass GatherBitsWord { + def v4i32_r32: GBRegInst<R32C, v4i32>; + def v4i32_r16: GBRegInst<R16C, v4i32>; + def v4i32: GBVecInst<v4i32>; +} + +defm GB: GatherBitsWord; + +// avgb: average bytes +def AVGB: + RRForm<0b11001011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "avgb\t$rT, $rA, $rB", ByteOp, + []>; + +// absdb: absolute difference of bytes +def ABSDB: + RRForm<0b11001010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "absdb\t$rT, $rA, $rB", ByteOp, + []>; + +// sumb: sum bytes into halfwords +def SUMB: + RRForm<0b11001010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "sumb\t$rT, $rA, $rB", ByteOp, + []>; + +// Sign extension operations: +class XSBHInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b01101101010, OOL, IOL, + "xsbh\t$rDst, $rSrc", + IntegerOp, pattern>; + +class XSBHInRegInst<RegisterClass rclass, list<dag> pattern>: + XSBHInst<(outs rclass:$rDst), (ins rclass:$rSrc), + pattern>; + +multiclass ExtendByteHalfword { + def v16i8: XSBHInst<(outs VECREG:$rDst), (ins VECREG:$rSrc), + [ + /*(set (v8i16 VECREG:$rDst), (sext (v8i16 VECREG:$rSrc)))*/]>; + def r8: XSBHInst<(outs R16C:$rDst), (ins R8C:$rSrc), + [(set R16C:$rDst, (sext R8C:$rSrc))]>; + def r16: XSBHInRegInst<R16C, + [(set R16C:$rDst, (sext_inreg R16C:$rSrc, i8))]>; + + // 32-bit form for XSBH: used to sign extend 8-bit quantities to 16-bit + // quantities to 32-bit quantities via a 32-bit register (see the sext 8->32 + // pattern below). Intentionally doesn't match a pattern because we want the + // sext 8->32 pattern to do the work for us, namely because we need the extra + // XSHWr32. 
+ def r32: XSBHInRegInst<R32C, [/* no pattern */]>; + + // Same as the 32-bit version, but for i64 + def r64: XSBHInRegInst<R64C, [/* no pattern */]>; +} + +defm XSBH : ExtendByteHalfword; + +// Sign extend halfwords to words: + +class XSHWInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b01101101010, OOL, IOL, "xshw\t$rDest, $rSrc", + IntegerOp, pattern>; + +class XSHWVecInst<ValueType in_vectype, ValueType out_vectype>: + XSHWInst<(outs VECREG:$rDest), (ins VECREG:$rSrc), + [(set (out_vectype VECREG:$rDest), + (sext (in_vectype VECREG:$rSrc)))]>; + +class XSHWInRegInst<RegisterClass rclass, list<dag> pattern>: + XSHWInst<(outs rclass:$rDest), (ins rclass:$rSrc), + pattern>; + +class XSHWRegInst<RegisterClass rclass>: + XSHWInst<(outs rclass:$rDest), (ins R16C:$rSrc), + [(set rclass:$rDest, (sext R16C:$rSrc))]>; + +multiclass ExtendHalfwordWord { + def v4i32: XSHWVecInst<v8i16, v4i32>; + + def r16: XSHWRegInst<R32C>; + + def r32: XSHWInRegInst<R32C, + [(set R32C:$rDest, (sext_inreg R32C:$rSrc, i16))]>; + def r64: XSHWInRegInst<R64C, [/* no pattern */]>; +} + +defm XSHW : ExtendHalfwordWord; + +// Sign-extend words to doublewords (32->64 bits) + +class XSWDInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm_1<0b01100101010, OOL, IOL, "xswd\t$rDst, $rSrc", + IntegerOp, pattern>; + +class XSWDVecInst<ValueType in_vectype, ValueType out_vectype>: + XSWDInst<(outs VECREG:$rDst), (ins VECREG:$rSrc), + [/*(set (out_vectype VECREG:$rDst), + (sext (out_vectype VECREG:$rSrc)))*/]>; + +class XSWDRegInst<RegisterClass in_rclass, RegisterClass out_rclass>: + XSWDInst<(outs out_rclass:$rDst), (ins in_rclass:$rSrc), + [(set out_rclass:$rDst, (sext in_rclass:$rSrc))]>; + +multiclass ExtendWordToDoubleWord { + def v2i64: XSWDVecInst<v4i32, v2i64>; + def r64: XSWDRegInst<R32C, R64C>; + + def r64_inreg: XSWDInst<(outs R64C:$rDst), (ins R64C:$rSrc), + [(set R64C:$rDst, (sext_inreg R64C:$rSrc, i32))]>; +} + +defm XSWD : ExtendWordToDoubleWord; + +// AND operations + +class ANDInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b10000011000, OOL, IOL, "and\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class ANDVecInst<ValueType vectype>: + ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), (and (vectype VECREG:$rA), + (vectype VECREG:$rB)))]>; + +class ANDRegInst<RegisterClass rclass>: + ANDInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (and rclass:$rA, rclass:$rB))]>; + +multiclass BitwiseAnd +{ + def v16i8: ANDVecInst<v16i8>; + def v8i16: ANDVecInst<v8i16>; + def v4i32: ANDVecInst<v4i32>; + def v2i64: ANDVecInst<v2i64>; + + def r128: ANDRegInst<GPRC>; + def r64: ANDRegInst<R64C>; + def r32: ANDRegInst<R32C>; + def r16: ANDRegInst<R16C>; + def r8: ANDRegInst<R8C>; + + //===--------------------------------------------- + // Special instructions to perform the fabs instruction + def fabs32: ANDInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB), + [/* Intentionally does not match a pattern */]>; + + def fabs64: ANDInst<(outs R64FP:$rT), (ins R64FP:$rA, R64C:$rB), + [/* Intentionally does not match a pattern */]>; + + def fabsvec: ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [/* Intentionally does not match a pattern */]>; + + //===--------------------------------------------- + + // Hacked form of AND to zero-extend 16-bit quantities to 32-bit + // quantities -- see 16->32 zext pattern. + // + // This pattern is somewhat artificial, since it might match some + // compiler generated pattern but it is unlikely to do so. 
+ + def i16i32: ANDInst<(outs R32C:$rT), (ins R16C:$rA, R32C:$rB), + [(set R32C:$rT, (and (zext R16C:$rA), R32C:$rB))]>; +} + +defm AND : BitwiseAnd; + + +def vnot_cell_conv : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v4i32 immAllOnesV)))>; + +// N.B.: vnot_cell_conv is one of those special target selection pattern +// fragments, +// in which we expect there to be a bit_convert on the constant. Bear in mind +// that llvm translates "not <reg>" to "xor <reg>, -1" (or in this case, a +// constant -1 vector.) + +class ANDCInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10000011010, OOL, IOL, "andc\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class ANDCVecInst<ValueType vectype, PatFrag vnot_frag = vnot>: + ANDCInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (and (vectype VECREG:$rA), + (vnot_frag (vectype VECREG:$rB))))]>; + +class ANDCRegInst<RegisterClass rclass>: + ANDCInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (and rclass:$rA, (not rclass:$rB)))]>; + +multiclass AndComplement +{ + def v16i8: ANDCVecInst<v16i8>; + def v8i16: ANDCVecInst<v8i16>; + def v4i32: ANDCVecInst<v4i32>; + def v2i64: ANDCVecInst<v2i64>; + + def r128: ANDCRegInst<GPRC>; + def r64: ANDCRegInst<R64C>; + def r32: ANDCRegInst<R32C>; + def r16: ANDCRegInst<R16C>; + def r8: ANDCRegInst<R8C>; + + // Sometimes, the xor pattern has a bitcast constant: + def v16i8_conv: ANDCVecInst<v16i8, vnot_cell_conv>; +} + +defm ANDC : AndComplement; + +class ANDBIInst<dag OOL, dag IOL, list<dag> pattern>: + RI10Form<0b01101000, OOL, IOL, "andbi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass AndByteImm +{ + def v16i8: ANDBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val), + [(set (v16i8 VECREG:$rT), + (and (v16i8 VECREG:$rA), + (v16i8 v16i8U8Imm:$val)))]>; + + def r8: ANDBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val), + [(set R8C:$rT, (and R8C:$rA, immU8:$val))]>; +} + +defm ANDBI : AndByteImm; + +class ANDHIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b10101000, OOL, IOL, "andhi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass AndHalfwordImm +{ + def v8i16: ANDHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v8i16 VECREG:$rT), + (and (v8i16 VECREG:$rA), v8i16SExt10Imm:$val))]>; + + def r16: ANDHIInst<(outs R16C:$rT), (ins R16C:$rA, u10imm:$val), + [(set R16C:$rT, (and R16C:$rA, i16ImmUns10:$val))]>; + + // Zero-extend i8 to i16: + def i8i16: ANDHIInst<(outs R16C:$rT), (ins R8C:$rA, u10imm:$val), + [(set R16C:$rT, (and (zext R8C:$rA), i16ImmUns10:$val))]>; +} + +defm ANDHI : AndHalfwordImm; + +class ANDIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b00101000, OOL, IOL, "andi\t$rT, $rA, $val", + IntegerOp, pattern>; + +multiclass AndWordImm +{ + def v4i32: ANDIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v4i32 VECREG:$rT), + (and (v4i32 VECREG:$rA), v4i32SExt10Imm:$val))]>; + + def r32: ANDIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val), + [(set R32C:$rT, (and R32C:$rA, i32ImmSExt10:$val))]>; + + // Hacked form of ANDI to zero-extend i8 quantities to i32. See the zext 8->32 + // pattern below. + def i8i32: ANDIInst<(outs R32C:$rT), (ins R8C:$rA, s10imm_i32:$val), + [(set R32C:$rT, + (and (zext R8C:$rA), i32ImmSExt10:$val))]>; + + // Hacked form of ANDI to zero-extend i16 quantities to i32. See the + // zext 16->32 pattern below. 
+ // + // Note that this pattern is somewhat artificial, since it might match + // something the compiler generates but is unlikely to occur in practice. + def i16i32: ANDIInst<(outs R32C:$rT), (ins R16C:$rA, s10imm_i32:$val), + [(set R32C:$rT, + (and (zext R16C:$rA), i32ImmSExt10:$val))]>; +} + +defm ANDI : AndWordImm; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Bitwise OR group: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +// Bitwise "or" (N.B.: These are also register-register copy instructions...) +class ORInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10000010000, OOL, IOL, "or\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class ORVecInst<ValueType vectype>: + ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA), + (vectype VECREG:$rB)))]>; + +class ORRegInst<RegisterClass rclass>: + ORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (or rclass:$rA, rclass:$rB))]>; + + +multiclass BitwiseOr +{ + def v16i8: ORVecInst<v16i8>; + def v8i16: ORVecInst<v8i16>; + def v4i32: ORVecInst<v4i32>; + def v2i64: ORVecInst<v2i64>; + + def v4f32: ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v4f32 VECREG:$rT), + (v4f32 (bitconvert (or (v4i32 VECREG:$rA), + (v4i32 VECREG:$rB)))))]>; + + def v2f64: ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v2f64 VECREG:$rT), + (v2f64 (bitconvert (or (v2i64 VECREG:$rA), + (v2i64 VECREG:$rB)))))]>; + + def r128: ORRegInst<GPRC>; + def r64: ORRegInst<R64C>; + def r32: ORRegInst<R32C>; + def r16: ORRegInst<R16C>; + def r8: ORRegInst<R8C>; + + // OR instructions used to copy f32 and f64 registers. + def f32: ORInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), + [/* no pattern */]>; + + def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB), + [/* no pattern */]>; +} + +defm OR : BitwiseOr; + +//===----------------------------------------------------------------------===// +// SPU::PREFSLOT2VEC and VEC2PREFSLOT re-interpretations of registers +//===----------------------------------------------------------------------===// +def : Pat<(v16i8 (SPUprefslot2vec R8C:$rA)), + (COPY_TO_REGCLASS R8C:$rA, VECREG)>; + +def : Pat<(v8i16 (SPUprefslot2vec R16C:$rA)), + (COPY_TO_REGCLASS R16C:$rA, VECREG)>; + +def : Pat<(v4i32 (SPUprefslot2vec R32C:$rA)), + (COPY_TO_REGCLASS R32C:$rA, VECREG)>; + +def : Pat<(v2i64 (SPUprefslot2vec R64C:$rA)), + (COPY_TO_REGCLASS R64C:$rA, VECREG)>; + +def : Pat<(v4f32 (SPUprefslot2vec R32FP:$rA)), + (COPY_TO_REGCLASS R32FP:$rA, VECREG)>; + +def : Pat<(v2f64 (SPUprefslot2vec R64FP:$rA)), + (COPY_TO_REGCLASS R64FP:$rA, VECREG)>; + +def : Pat<(i8 (SPUvec2prefslot (v16i8 VECREG:$rA))), + (COPY_TO_REGCLASS (v16i8 VECREG:$rA), R8C)>; + +def : Pat<(i16 (SPUvec2prefslot (v8i16 VECREG:$rA))), + (COPY_TO_REGCLASS (v8i16 VECREG:$rA), R16C)>; + +def : Pat<(i32 (SPUvec2prefslot (v4i32 VECREG:$rA))), + (COPY_TO_REGCLASS (v4i32 VECREG:$rA), R32C)>; + +def : Pat<(i64 (SPUvec2prefslot (v2i64 VECREG:$rA))), + (COPY_TO_REGCLASS (v2i64 VECREG:$rA), R64C)>; + +def : Pat<(f32 (SPUvec2prefslot (v4f32 VECREG:$rA))), + (COPY_TO_REGCLASS (v4f32 VECREG:$rA), R32FP)>; + +def : Pat<(f64 (SPUvec2prefslot (v2f64 VECREG:$rA))), + (COPY_TO_REGCLASS (v2f64 VECREG:$rA), R64FP)>; + +// Load Register: This is an assembler alias for a bitwise OR of a register +// against itself. It's here because it brings some clarity to assembly +// language output. 
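+//
+// (The prefslot2vec/vec2prefslot re-interpretations above are pure
+// register-class copies -- a scalar lives in the preferred slot of the same
+// 128-bit register that holds a vector -- so they need no instruction of
+// their own; the LR form below covers the case where an explicit
+// register-to-register copy is still wanted.)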
+ +let hasCtrlDep = 1 in { + class LRInst<dag OOL, dag IOL> + : SPUInstr<OOL, IOL, "lr\t$rT, $rA", IntegerOp> { + bits<7> RA; + bits<7> RT; + + let Pattern = [/*no pattern*/]; + + let Inst{0-10} = 0b10000010000; /* It's an OR operation */ + let Inst{11-17} = RA; + let Inst{18-24} = RA; + let Inst{25-31} = RT; + } + + class LRVecInst<ValueType vectype>: + LRInst<(outs VECREG:$rT), (ins VECREG:$rA)>; + + class LRRegInst<RegisterClass rclass>: + LRInst<(outs rclass:$rT), (ins rclass:$rA)>; + + multiclass LoadRegister { + def v2i64: LRVecInst<v2i64>; + def v2f64: LRVecInst<v2f64>; + def v4i32: LRVecInst<v4i32>; + def v4f32: LRVecInst<v4f32>; + def v8i16: LRVecInst<v8i16>; + def v16i8: LRVecInst<v16i8>; + + def r128: LRRegInst<GPRC>; + def r64: LRRegInst<R64C>; + def f64: LRRegInst<R64FP>; + def r32: LRRegInst<R32C>; + def f32: LRRegInst<R32FP>; + def r16: LRRegInst<R16C>; + def r8: LRRegInst<R8C>; + } + + defm LR: LoadRegister; +} + +// ORC: Bitwise "or" with complement (c = a | ~b) + +class ORCInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10010010000, OOL, IOL, "orc\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class ORCVecInst<ValueType vectype>: + ORCInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA), + (vnot (vectype VECREG:$rB))))]>; + +class ORCRegInst<RegisterClass rclass>: + ORCInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (or rclass:$rA, (not rclass:$rB)))]>; + +multiclass BitwiseOrComplement +{ + def v16i8: ORCVecInst<v16i8>; + def v8i16: ORCVecInst<v8i16>; + def v4i32: ORCVecInst<v4i32>; + def v2i64: ORCVecInst<v2i64>; + + def r128: ORCRegInst<GPRC>; + def r64: ORCRegInst<R64C>; + def r32: ORCRegInst<R32C>; + def r16: ORCRegInst<R16C>; + def r8: ORCRegInst<R8C>; +} + +defm ORC : BitwiseOrComplement; + +// OR byte immediate +class ORBIInst<dag OOL, dag IOL, list<dag> pattern>: + RI10Form<0b01100000, OOL, IOL, "orbi\t$rT, $rA, $val", + IntegerOp, pattern>; + +class ORBIVecInst<ValueType vectype, PatLeaf immpred>: + ORBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val), + [(set (v16i8 VECREG:$rT), (or (vectype VECREG:$rA), + (vectype immpred:$val)))]>; + +multiclass BitwiseOrByteImm +{ + def v16i8: ORBIVecInst<v16i8, v16i8U8Imm>; + + def r8: ORBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val), + [(set R8C:$rT, (or R8C:$rA, immU8:$val))]>; +} + +defm ORBI : BitwiseOrByteImm; + +// OR halfword immediate +class ORHIInst<dag OOL, dag IOL, list<dag> pattern>: + RI10Form<0b10100000, OOL, IOL, "orhi\t$rT, $rA, $val", + IntegerOp, pattern>; + +class ORHIVecInst<ValueType vectype, PatLeaf immpred>: + ORHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val), + [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA), + immpred:$val))]>; + +multiclass BitwiseOrHalfwordImm +{ + def v8i16: ORHIVecInst<v8i16, v8i16Uns10Imm>; + + def r16: ORHIInst<(outs R16C:$rT), (ins R16C:$rA, u10imm:$val), + [(set R16C:$rT, (or R16C:$rA, i16ImmUns10:$val))]>; + + // Specialized ORHI form used to promote 8-bit registers to 16-bit + def i8i16: ORHIInst<(outs R16C:$rT), (ins R8C:$rA, s10imm:$val), + [(set R16C:$rT, (or (anyext R8C:$rA), + i16ImmSExt10:$val))]>; +} + +defm ORHI : BitwiseOrHalfwordImm; + +class ORIInst<dag OOL, dag IOL, list<dag> pattern>: + RI10Form<0b00100000, OOL, IOL, "ori\t$rT, $rA, $val", + IntegerOp, pattern>; + +class ORIVecInst<ValueType vectype, PatLeaf immpred>: + ORIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val), + [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA), + immpred:$val))]>; + 
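+// Note on the specialized i8i16 / i16i32 / i8i32 forms above and below: the
+// "anyext" promotion patterns further down the file presumably instantiate
+// them with a zero immediate, so that e.g. "ori $rT, $rA, 0" doubles as a
+// plain widening move between register classes.
+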
+// Bitwise "or" with immediate +multiclass BitwiseOrImm +{ + def v4i32: ORIVecInst<v4i32, v4i32Uns10Imm>; + + def r32: ORIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val), + [(set R32C:$rT, (or R32C:$rA, i32ImmSExt10:$val))]>; + + // i16i32: hacked version of the ori instruction to extend 16-bit quantities + // to 32-bit quantities. used exclusively to match "anyext" conversions (vide + // infra "anyext 16->32" pattern.) + def i16i32: ORIInst<(outs R32C:$rT), (ins R16C:$rA, s10imm_i32:$val), + [(set R32C:$rT, (or (anyext R16C:$rA), + i32ImmSExt10:$val))]>; + + // i8i32: Hacked version of the ORI instruction to extend 16-bit quantities + // to 32-bit quantities. Used exclusively to match "anyext" conversions (vide + // infra "anyext 16->32" pattern.) + def i8i32: ORIInst<(outs R32C:$rT), (ins R8C:$rA, s10imm_i32:$val), + [(set R32C:$rT, (or (anyext R8C:$rA), + i32ImmSExt10:$val))]>; +} + +defm ORI : BitwiseOrImm; + +// ORX: "or" across the vector: or's $rA's word slots leaving the result in +// $rT[0], slots 1-3 are zeroed. +// +// FIXME: Needs to match an intrinsic pattern. +def ORXv4i32: + RRForm<0b10010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "orx\t$rT, $rA, $rB", IntegerOp, + []>; + +// XOR: + +class XORInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b10010010000, OOL, IOL, "xor\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class XORVecInst<ValueType vectype>: + XORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), (xor (vectype VECREG:$rA), + (vectype VECREG:$rB)))]>; + +class XORRegInst<RegisterClass rclass>: + XORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (xor rclass:$rA, rclass:$rB))]>; + +multiclass BitwiseExclusiveOr +{ + def v16i8: XORVecInst<v16i8>; + def v8i16: XORVecInst<v8i16>; + def v4i32: XORVecInst<v4i32>; + def v2i64: XORVecInst<v2i64>; + + def r128: XORRegInst<GPRC>; + def r64: XORRegInst<R64C>; + def r32: XORRegInst<R32C>; + def r16: XORRegInst<R16C>; + def r8: XORRegInst<R8C>; + + // XOR instructions used to negate f32 and f64 quantities. 
+ + def fneg32: XORInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB), + [/* no pattern */]>; + + def fneg64: XORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64C:$rB), + [/* no pattern */]>; + + def fnegvec: XORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [/* no pattern, see fneg{32,64} */]>; +} + +defm XOR : BitwiseExclusiveOr; + +//==---------------------------------------------------------- + +class XORBIInst<dag OOL, dag IOL, list<dag> pattern>: + RI10Form<0b01100000, OOL, IOL, "xorbi\t$rT, $rA, $val", + IntegerOp, pattern>; + +multiclass XorByteImm +{ + def v16i8: + XORBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val), + [(set (v16i8 VECREG:$rT), (xor (v16i8 VECREG:$rA), v16i8U8Imm:$val))]>; + + def r8: + XORBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val), + [(set R8C:$rT, (xor R8C:$rA, immU8:$val))]>; +} + +defm XORBI : XorByteImm; + +def XORHIv8i16: + RI10Form<0b10100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val), + "xorhi\t$rT, $rA, $val", IntegerOp, + [(set (v8i16 VECREG:$rT), (xor (v8i16 VECREG:$rA), + v8i16SExt10Imm:$val))]>; + +def XORHIr16: + RI10Form<0b10100000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val), + "xorhi\t$rT, $rA, $val", IntegerOp, + [(set R16C:$rT, (xor R16C:$rA, i16ImmSExt10:$val))]>; + +def XORIv4i32: + RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm_i32:$val), + "xori\t$rT, $rA, $val", IntegerOp, + [(set (v4i32 VECREG:$rT), (xor (v4i32 VECREG:$rA), + v4i32SExt10Imm:$val))]>; + +def XORIr32: + RI10Form<0b00100000, (outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val), + "xori\t$rT, $rA, $val", IntegerOp, + [(set R32C:$rT, (xor R32C:$rA, i32ImmSExt10:$val))]>; + +// NAND: + +class NANDInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10010011000, OOL, IOL, "nand\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class NANDVecInst<ValueType vectype>: + NANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), (vnot (and (vectype VECREG:$rA), + (vectype VECREG:$rB))))]>; +class NANDRegInst<RegisterClass rclass>: + NANDInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (not (and rclass:$rA, rclass:$rB)))]>; + +multiclass BitwiseNand +{ + def v16i8: NANDVecInst<v16i8>; + def v8i16: NANDVecInst<v8i16>; + def v4i32: NANDVecInst<v4i32>; + def v2i64: NANDVecInst<v2i64>; + + def r128: NANDRegInst<GPRC>; + def r64: NANDRegInst<R64C>; + def r32: NANDRegInst<R32C>; + def r16: NANDRegInst<R16C>; + def r8: NANDRegInst<R8C>; +} + +defm NAND : BitwiseNand; + +// NOR: + +class NORInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10010010000, OOL, IOL, "nor\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class NORVecInst<ValueType vectype>: + NORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), (vnot (or (vectype VECREG:$rA), + (vectype VECREG:$rB))))]>; +class NORRegInst<RegisterClass rclass>: + NORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (not (or rclass:$rA, rclass:$rB)))]>; + +multiclass BitwiseNor +{ + def v16i8: NORVecInst<v16i8>; + def v8i16: NORVecInst<v8i16>; + def v4i32: NORVecInst<v4i32>; + def v2i64: NORVecInst<v2i64>; + + def r128: NORRegInst<GPRC>; + def r64: NORRegInst<R64C>; + def r32: NORRegInst<R32C>; + def r16: NORRegInst<R16C>; + def r8: NORRegInst<R8C>; +} + +defm NOR : BitwiseNor; + +// Select bits: +class SELBInst<dag OOL, dag IOL, list<dag> pattern>: + RRRForm<0b1000, OOL, IOL, "selb\t$rT, $rA, $rB, $rC", + IntegerOp, pattern>; + +class SELBVecInst<ValueType vectype, PatFrag vnot_frag = vnot>: + SELBInst<(outs 
VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + [(set (vectype VECREG:$rT), + (or (and (vectype VECREG:$rC), (vectype VECREG:$rB)), + (and (vnot_frag (vectype VECREG:$rC)), + (vectype VECREG:$rA))))]>; + +class SELBVecVCondInst<ValueType vectype>: + SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + [(set (vectype VECREG:$rT), + (select (vectype VECREG:$rC), + (vectype VECREG:$rB), + (vectype VECREG:$rA)))]>; + +class SELBVecCondInst<ValueType vectype>: + SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, R32C:$rC), + [(set (vectype VECREG:$rT), + (select R32C:$rC, + (vectype VECREG:$rB), + (vectype VECREG:$rA)))]>; + +class SELBRegInst<RegisterClass rclass>: + SELBInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB, rclass:$rC), + [(set rclass:$rT, + (or (and rclass:$rB, rclass:$rC), + (and rclass:$rA, (not rclass:$rC))))]>; + +class SELBRegCondInst<RegisterClass rcond, RegisterClass rclass>: + SELBInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB, rcond:$rC), + [(set rclass:$rT, + (select rcond:$rC, rclass:$rB, rclass:$rA))]>; + +multiclass SelectBits +{ + def v16i8: SELBVecInst<v16i8>; + def v8i16: SELBVecInst<v8i16>; + def v4i32: SELBVecInst<v4i32>; + def v2i64: SELBVecInst<v2i64, vnot_cell_conv>; + + def r128: SELBRegInst<GPRC>; + def r64: SELBRegInst<R64C>; + def r32: SELBRegInst<R32C>; + def r16: SELBRegInst<R16C>; + def r8: SELBRegInst<R8C>; + + def v16i8_cond: SELBVecCondInst<v16i8>; + def v8i16_cond: SELBVecCondInst<v8i16>; + def v4i32_cond: SELBVecCondInst<v4i32>; + def v2i64_cond: SELBVecCondInst<v2i64>; + + def v16i8_vcond: SELBVecCondInst<v16i8>; + def v8i16_vcond: SELBVecCondInst<v8i16>; + def v4i32_vcond: SELBVecCondInst<v4i32>; + def v2i64_vcond: SELBVecCondInst<v2i64>; + + def v4f32_cond: + SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + [(set (v4f32 VECREG:$rT), + (select (v4i32 VECREG:$rC), + (v4f32 VECREG:$rB), + (v4f32 VECREG:$rA)))]>; + + // SELBr64_cond is defined in SPU64InstrInfo.td + def r32_cond: SELBRegCondInst<R32C, R32C>; + def f32_cond: SELBRegCondInst<R32C, R32FP>; + def r16_cond: SELBRegCondInst<R16C, R16C>; + def r8_cond: SELBRegCondInst<R8C, R8C>; +} + +defm SELB : SelectBits; + +class SPUselbPatVec<ValueType vectype, SPUInstr inst>: + Pat<(SPUselb (vectype VECREG:$rA), (vectype VECREG:$rB), (vectype VECREG:$rC)), + (inst VECREG:$rA, VECREG:$rB, VECREG:$rC)>; + +def : SPUselbPatVec<v16i8, SELBv16i8>; +def : SPUselbPatVec<v8i16, SELBv8i16>; +def : SPUselbPatVec<v4i32, SELBv4i32>; +def : SPUselbPatVec<v2i64, SELBv2i64>; + +class SPUselbPatReg<RegisterClass rclass, SPUInstr inst>: + Pat<(SPUselb rclass:$rA, rclass:$rB, rclass:$rC), + (inst rclass:$rA, rclass:$rB, rclass:$rC)>; + +def : SPUselbPatReg<R8C, SELBr8>; +def : SPUselbPatReg<R16C, SELBr16>; +def : SPUselbPatReg<R32C, SELBr32>; +def : SPUselbPatReg<R64C, SELBr64>; + +// EQV: Equivalence (1 for each same bit, otherwise 0) +// +// Note: There are a lot of ways to match this bit operator and these patterns +// attempt to be as exhaustive as possible. 
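+//
+// All of the variants below compute the same bitwise XNOR:
+//
+//   t = ~(a ^ b)  ==  (a & b) | (~a & ~b)  ==  a ^ ~b
+//
+// the separate pattern classes exist only because the DAG combiner may hand
+// the selector any one of these equivalent forms.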
+ +class EQVInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10010010000, OOL, IOL, "eqv\t$rT, $rA, $rB", + IntegerOp, pattern>; + +class EQVVecInst<ValueType vectype>: + EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (or (and (vectype VECREG:$rA), (vectype VECREG:$rB)), + (and (vnot (vectype VECREG:$rA)), + (vnot (vectype VECREG:$rB)))))]>; + +class EQVRegInst<RegisterClass rclass>: + EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (or (and rclass:$rA, rclass:$rB), + (and (not rclass:$rA), (not rclass:$rB))))]>; + +class EQVVecPattern1<ValueType vectype>: + EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (xor (vectype VECREG:$rA), (vnot (vectype VECREG:$rB))))]>; + +class EQVRegPattern1<RegisterClass rclass>: + EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (xor rclass:$rA, (not rclass:$rB)))]>; + +class EQVVecPattern2<ValueType vectype>: + EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (or (and (vectype VECREG:$rA), (vectype VECREG:$rB)), + (vnot (or (vectype VECREG:$rA), (vectype VECREG:$rB)))))]>; + +class EQVRegPattern2<RegisterClass rclass>: + EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, + (or (and rclass:$rA, rclass:$rB), + (not (or rclass:$rA, rclass:$rB))))]>; + +class EQVVecPattern3<ValueType vectype>: + EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (not (xor (vectype VECREG:$rA), (vectype VECREG:$rB))))]>; + +class EQVRegPattern3<RegisterClass rclass>: + EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (not (xor rclass:$rA, rclass:$rB)))]>; + +multiclass BitEquivalence +{ + def v16i8: EQVVecInst<v16i8>; + def v8i16: EQVVecInst<v8i16>; + def v4i32: EQVVecInst<v4i32>; + def v2i64: EQVVecInst<v2i64>; + + def v16i8_1: EQVVecPattern1<v16i8>; + def v8i16_1: EQVVecPattern1<v8i16>; + def v4i32_1: EQVVecPattern1<v4i32>; + def v2i64_1: EQVVecPattern1<v2i64>; + + def v16i8_2: EQVVecPattern2<v16i8>; + def v8i16_2: EQVVecPattern2<v8i16>; + def v4i32_2: EQVVecPattern2<v4i32>; + def v2i64_2: EQVVecPattern2<v2i64>; + + def v16i8_3: EQVVecPattern3<v16i8>; + def v8i16_3: EQVVecPattern3<v8i16>; + def v4i32_3: EQVVecPattern3<v4i32>; + def v2i64_3: EQVVecPattern3<v2i64>; + + def r128: EQVRegInst<GPRC>; + def r64: EQVRegInst<R64C>; + def r32: EQVRegInst<R32C>; + def r16: EQVRegInst<R16C>; + def r8: EQVRegInst<R8C>; + + def r128_1: EQVRegPattern1<GPRC>; + def r64_1: EQVRegPattern1<R64C>; + def r32_1: EQVRegPattern1<R32C>; + def r16_1: EQVRegPattern1<R16C>; + def r8_1: EQVRegPattern1<R8C>; + + def r128_2: EQVRegPattern2<GPRC>; + def r64_2: EQVRegPattern2<R64C>; + def r32_2: EQVRegPattern2<R32C>; + def r16_2: EQVRegPattern2<R16C>; + def r8_2: EQVRegPattern2<R8C>; + + def r128_3: EQVRegPattern3<GPRC>; + def r64_3: EQVRegPattern3<R64C>; + def r32_3: EQVRegPattern3<R32C>; + def r16_3: EQVRegPattern3<R16C>; + def r8_3: EQVRegPattern3<R8C>; +} + +defm EQV: BitEquivalence; + +//===----------------------------------------------------------------------===// +// Vector shuffle... +//===----------------------------------------------------------------------===// +// SPUshuffle is generated in LowerVECTOR_SHUFFLE and gets replaced with SHUFB. +// See the SPUshuffle SDNode operand above, which sets up the DAG pattern +// matcher to emit something when the LowerVECTOR_SHUFFLE generates a node with +// the SPUISD::SHUFB opcode. 
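+//
+// As a reminder of the underlying semantics (SPU ISA): each control byte c
+// of $rC selects one result byte --
+//   0b10xxxxxx -> 0x00,  0b110xxxxx -> 0xFF,  0b111xxxxx -> 0x80,
+//   otherwise the low five bits of c index the 32 bytes of $rA:$rB
+//     ($rA supplies bytes 0-15, $rB bytes 16-31).
+// The mask is therefore byte-granular, which is why both v16i8 and v4i32
+// mask types below map onto the same instruction.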
+//===----------------------------------------------------------------------===// + +class SHUFBInst<dag OOL, dag IOL, list<dag> pattern>: + RRRForm<0b1000, OOL, IOL, "shufb\t$rT, $rA, $rB, $rC", + ShuffleOp, pattern>; + +class SHUFBVecInst<ValueType resultvec, ValueType maskvec>: + SHUFBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + [(set (resultvec VECREG:$rT), + (SPUshuffle (resultvec VECREG:$rA), + (resultvec VECREG:$rB), + (maskvec VECREG:$rC)))]>; + +class SHUFBGPRCInst: + SHUFBInst<(outs VECREG:$rT), (ins GPRC:$rA, GPRC:$rB, VECREG:$rC), + [/* no pattern */]>; + +multiclass ShuffleBytes +{ + def v16i8 : SHUFBVecInst<v16i8, v16i8>; + def v16i8_m32 : SHUFBVecInst<v16i8, v4i32>; + def v8i16 : SHUFBVecInst<v8i16, v16i8>; + def v8i16_m32 : SHUFBVecInst<v8i16, v4i32>; + def v4i32 : SHUFBVecInst<v4i32, v16i8>; + def v4i32_m32 : SHUFBVecInst<v4i32, v4i32>; + def v2i64 : SHUFBVecInst<v2i64, v16i8>; + def v2i64_m32 : SHUFBVecInst<v2i64, v4i32>; + + def v4f32 : SHUFBVecInst<v4f32, v16i8>; + def v4f32_m32 : SHUFBVecInst<v4f32, v4i32>; + + def v2f64 : SHUFBVecInst<v2f64, v16i8>; + def v2f64_m32 : SHUFBVecInst<v2f64, v4i32>; + + def gprc : SHUFBGPRCInst; +} + +defm SHUFB : ShuffleBytes; + +//===----------------------------------------------------------------------===// +// Shift and rotate group: +//===----------------------------------------------------------------------===// + +class SHLHInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b11111010000, OOL, IOL, "shlh\t$rT, $rA, $rB", + RotShiftVec, pattern>; + +class SHLHVecInst<ValueType vectype>: + SHLHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (SPUvec_shl (vectype VECREG:$rA), (vectype VECREG:$rB)))]>; + +multiclass ShiftLeftHalfword +{ + def v8i16: SHLHVecInst<v8i16>; + def r16: SHLHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB), + [(set R16C:$rT, (shl R16C:$rA, R16C:$rB))]>; + def r16_r32: SHLHInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB), + [(set R16C:$rT, (shl R16C:$rA, R32C:$rB))]>; +} + +defm SHLH : ShiftLeftHalfword; + +//===----------------------------------------------------------------------===// + +class SHLHIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b11111010000, OOL, IOL, "shlhi\t$rT, $rA, $val", + RotShiftVec, pattern>; + +class SHLHIVecInst<ValueType vectype>: + SHLHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val), + [(set (vectype VECREG:$rT), + (SPUvec_shl (vectype VECREG:$rA), (i16 uimm7:$val)))]>; + +multiclass ShiftLeftHalfwordImm +{ + def v8i16: SHLHIVecInst<v8i16>; + def r16: SHLHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm:$val), + [(set R16C:$rT, (shl R16C:$rA, (i16 uimm7:$val)))]>; +} + +defm SHLHI : ShiftLeftHalfwordImm; + +def : Pat<(SPUvec_shl (v8i16 VECREG:$rA), (i32 uimm7:$val)), + (SHLHIv8i16 VECREG:$rA, (TO_IMM16 uimm7:$val))>; + +def : Pat<(shl R16C:$rA, (i32 uimm7:$val)), + (SHLHIr16 R16C:$rA, (TO_IMM16 uimm7:$val))>; + +//===----------------------------------------------------------------------===// + +class SHLInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b11111010000, OOL, IOL, "shl\t$rT, $rA, $rB", + RotShiftVec, pattern>; + +multiclass ShiftLeftWord +{ + def v4i32: + SHLInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v4i32 VECREG:$rT), + (SPUvec_shl (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + def r32: + SHLInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), + [(set R32C:$rT, (shl R32C:$rA, R32C:$rB))]>; +} + +defm SHL: ShiftLeftWord; + 
+//===----------------------------------------------------------------------===// + +class SHLIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b11111010000, OOL, IOL, "shli\t$rT, $rA, $val", + RotShiftVec, pattern>; + +multiclass ShiftLeftWordImm +{ + def v4i32: + SHLIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val), + [(set (v4i32 VECREG:$rT), + (SPUvec_shl (v4i32 VECREG:$rA), (i32 uimm7:$val)))]>; + + def r32: + SHLIInst<(outs R32C:$rT), (ins R32C:$rA, u7imm_i32:$val), + [(set R32C:$rT, (shl R32C:$rA, (i32 uimm7:$val)))]>; +} + +defm SHLI : ShiftLeftWordImm; + +//===----------------------------------------------------------------------===// +// SHLQBI vec form: Note that this will shift the entire vector (the 128-bit +// register) to the left. Vector form is here to ensure type correctness. +// +// The shift count is in the lowest 3 bits (29-31) of $rB, so only a bit shift +// of 7 bits is actually possible. +// +// Note also that SHLQBI/SHLQBII are used in conjunction with SHLQBY/SHLQBYI +// to shift i64 and i128. SHLQBI is the residual left over after shifting by +// bytes with SHLQBY. + +class SHLQBIInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b11011011100, OOL, IOL, "shlqbi\t$rT, $rA, $rB", + RotShiftQuad, pattern>; + +class SHLQBIVecInst<ValueType vectype>: + SHLQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [(set (vectype VECREG:$rT), + (SPUshlquad_l_bits (vectype VECREG:$rA), R32C:$rB))]>; + +class SHLQBIRegInst<RegisterClass rclass>: + SHLQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB), + [/* no pattern */]>; + +multiclass ShiftLeftQuadByBits +{ + def v16i8: SHLQBIVecInst<v16i8>; + def v8i16: SHLQBIVecInst<v8i16>; + def v4i32: SHLQBIVecInst<v4i32>; + def v4f32: SHLQBIVecInst<v4f32>; + def v2i64: SHLQBIVecInst<v2i64>; + def v2f64: SHLQBIVecInst<v2f64>; + + def r128: SHLQBIRegInst<GPRC>; +} + +defm SHLQBI : ShiftLeftQuadByBits; + +// See note above on SHLQBI. In this case, the predicate actually does then +// enforcement, whereas with SHLQBI, we have to "take it on faith." +class SHLQBIIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b11011111100, OOL, IOL, "shlqbii\t$rT, $rA, $val", + RotShiftQuad, pattern>; + +class SHLQBIIVecInst<ValueType vectype>: + SHLQBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val), + [(set (vectype VECREG:$rT), + (SPUshlquad_l_bits (vectype VECREG:$rA), (i32 bitshift:$val)))]>; + +multiclass ShiftLeftQuadByBitsImm +{ + def v16i8 : SHLQBIIVecInst<v16i8>; + def v8i16 : SHLQBIIVecInst<v8i16>; + def v4i32 : SHLQBIIVecInst<v4i32>; + def v4f32 : SHLQBIIVecInst<v4f32>; + def v2i64 : SHLQBIIVecInst<v2i64>; + def v2f64 : SHLQBIIVecInst<v2f64>; +} + +defm SHLQBII : ShiftLeftQuadByBitsImm; + +// SHLQBY, SHLQBYI vector forms: Shift the entire vector to the left by bytes, +// not by bits. See notes above on SHLQBI. 
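+//
+// Illustrative idiom (not matched by any pattern here): a full 128-bit left
+// shift by a variable bit count n composes the byte and bit forms, e.g.
+//
+//   shlqbybi  $t, $rA, $n     ; shift by n/8 bytes (low three bits ignored)
+//   shlqbi    $rT, $t, $n     ; shift by the remaining n%8 bits
+//
+// the srl selection pattern further below does the same thing with
+// ROTQMBYBI/ROTQMBI and a negated count.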
+ +class SHLQBYInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b11111011100, OOL, IOL, "shlqby\t$rT, $rA, $rB", + RotShiftQuad, pattern>; + +class SHLQBYVecInst<ValueType vectype>: + SHLQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [(set (vectype VECREG:$rT), + (SPUshlquad_l_bytes (vectype VECREG:$rA), R32C:$rB))]>; + +multiclass ShiftLeftQuadBytes +{ + def v16i8: SHLQBYVecInst<v16i8>; + def v8i16: SHLQBYVecInst<v8i16>; + def v4i32: SHLQBYVecInst<v4i32>; + def v4f32: SHLQBYVecInst<v4f32>; + def v2i64: SHLQBYVecInst<v2i64>; + def v2f64: SHLQBYVecInst<v2f64>; + def r128: SHLQBYInst<(outs GPRC:$rT), (ins GPRC:$rA, R32C:$rB), + [(set GPRC:$rT, (SPUshlquad_l_bytes GPRC:$rA, R32C:$rB))]>; +} + +defm SHLQBY: ShiftLeftQuadBytes; + +class SHLQBYIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b11111111100, OOL, IOL, "shlqbyi\t$rT, $rA, $val", + RotShiftQuad, pattern>; + +class SHLQBYIVecInst<ValueType vectype>: + SHLQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val), + [(set (vectype VECREG:$rT), + (SPUshlquad_l_bytes (vectype VECREG:$rA), (i32 uimm7:$val)))]>; + +multiclass ShiftLeftQuadBytesImm +{ + def v16i8: SHLQBYIVecInst<v16i8>; + def v8i16: SHLQBYIVecInst<v8i16>; + def v4i32: SHLQBYIVecInst<v4i32>; + def v4f32: SHLQBYIVecInst<v4f32>; + def v2i64: SHLQBYIVecInst<v2i64>; + def v2f64: SHLQBYIVecInst<v2f64>; + def r128: SHLQBYIInst<(outs GPRC:$rT), (ins GPRC:$rA, u7imm_i32:$val), + [(set GPRC:$rT, + (SPUshlquad_l_bytes GPRC:$rA, (i32 uimm7:$val)))]>; +} + +defm SHLQBYI : ShiftLeftQuadBytesImm; + +class SHLQBYBIInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b00111001111, OOL, IOL, "shlqbybi\t$rT, $rA, $rB", + RotShiftQuad, pattern>; + +class SHLQBYBIVecInst<ValueType vectype>: + SHLQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [/* no pattern */]>; + +class SHLQBYBIRegInst<RegisterClass rclass>: + SHLQBYBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB), + [/* no pattern */]>; + +multiclass ShiftLeftQuadBytesBitCount +{ + def v16i8: SHLQBYBIVecInst<v16i8>; + def v8i16: SHLQBYBIVecInst<v8i16>; + def v4i32: SHLQBYBIVecInst<v4i32>; + def v4f32: SHLQBYBIVecInst<v4f32>; + def v2i64: SHLQBYBIVecInst<v2i64>; + def v2f64: SHLQBYBIVecInst<v2f64>; + + def r128: SHLQBYBIRegInst<GPRC>; +} + +defm SHLQBYBI : ShiftLeftQuadBytesBitCount; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate halfword: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +class ROTHInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b00111010000, OOL, IOL, "roth\t$rT, $rA, $rB", + RotShiftVec, pattern>; + +class ROTHVecInst<ValueType vectype>: + ROTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (SPUvec_rotl VECREG:$rA, (v8i16 VECREG:$rB)))]>; + +class ROTHRegInst<RegisterClass rclass>: + ROTHInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + [(set rclass:$rT, (rotl rclass:$rA, rclass:$rB))]>; + +multiclass RotateLeftHalfword +{ + def v8i16: ROTHVecInst<v8i16>; + def r16: ROTHRegInst<R16C>; +} + +defm ROTH: RotateLeftHalfword; + +def ROTHr16_r32: ROTHInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB), + [(set R16C:$rT, (rotl R16C:$rA, R32C:$rB))]>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate halfword, immediate: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +class ROTHIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b00111110000, OOL, IOL, "rothi\t$rT, $rA, $val", + RotShiftVec, pattern>; + +class 
ROTHIVecInst<ValueType vectype>: + ROTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val), + [(set (vectype VECREG:$rT), + (SPUvec_rotl VECREG:$rA, (i16 uimm7:$val)))]>; + +multiclass RotateLeftHalfwordImm +{ + def v8i16: ROTHIVecInst<v8i16>; + def r16: ROTHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm:$val), + [(set R16C:$rT, (rotl R16C:$rA, (i16 uimm7:$val)))]>; + def r16_r32: ROTHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm_i32:$val), + [(set R16C:$rT, (rotl R16C:$rA, (i32 uimm7:$val)))]>; +} + +defm ROTHI: RotateLeftHalfwordImm; + +def : Pat<(SPUvec_rotl (v8i16 VECREG:$rA), (i32 uimm7:$val)), + (ROTHIv8i16 VECREG:$rA, (TO_IMM16 imm:$val))>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate word: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b00011010000, OOL, IOL, "rot\t$rT, $rA, $rB", + RotShiftVec, pattern>; + +class ROTVecInst<ValueType vectype>: + ROTInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [(set (vectype VECREG:$rT), + (SPUvec_rotl (vectype VECREG:$rA), R32C:$rB))]>; + +class ROTRegInst<RegisterClass rclass>: + ROTInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB), + [(set rclass:$rT, + (rotl rclass:$rA, R32C:$rB))]>; + +multiclass RotateLeftWord +{ + def v4i32: ROTVecInst<v4i32>; + def r32: ROTRegInst<R32C>; +} + +defm ROT: RotateLeftWord; + +// The rotate amount is in the same bits whether we've got an 8-bit, 16-bit or +// 32-bit register +def ROTr32_r16_anyext: + ROTInst<(outs R32C:$rT), (ins R32C:$rA, R16C:$rB), + [(set R32C:$rT, (rotl R32C:$rA, (i32 (anyext R16C:$rB))))]>; + +def : Pat<(rotl R32C:$rA, (i32 (zext R16C:$rB))), + (ROTr32_r16_anyext R32C:$rA, R16C:$rB)>; + +def : Pat<(rotl R32C:$rA, (i32 (sext R16C:$rB))), + (ROTr32_r16_anyext R32C:$rA, R16C:$rB)>; + +def ROTr32_r8_anyext: + ROTInst<(outs R32C:$rT), (ins R32C:$rA, R8C:$rB), + [(set R32C:$rT, (rotl R32C:$rA, (i32 (anyext R8C:$rB))))]>; + +def : Pat<(rotl R32C:$rA, (i32 (zext R8C:$rB))), + (ROTr32_r8_anyext R32C:$rA, R8C:$rB)>; + +def : Pat<(rotl R32C:$rA, (i32 (sext R8C:$rB))), + (ROTr32_r8_anyext R32C:$rA, R8C:$rB)>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate word, immediate +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b00011110000, OOL, IOL, "roti\t$rT, $rA, $val", + RotShiftVec, pattern>; + +class ROTIVecInst<ValueType vectype, Operand optype, ValueType inttype, PatLeaf pred>: + ROTIInst<(outs VECREG:$rT), (ins VECREG:$rA, optype:$val), + [(set (vectype VECREG:$rT), + (SPUvec_rotl (vectype VECREG:$rA), (inttype pred:$val)))]>; + +class ROTIRegInst<RegisterClass rclass, Operand optype, ValueType inttype, PatLeaf pred>: + ROTIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val), + [(set rclass:$rT, (rotl rclass:$rA, (inttype pred:$val)))]>; + +multiclass RotateLeftWordImm +{ + def v4i32: ROTIVecInst<v4i32, u7imm_i32, i32, uimm7>; + def v4i32_i16: ROTIVecInst<v4i32, u7imm, i16, uimm7>; + def v4i32_i8: ROTIVecInst<v4i32, u7imm_i8, i8, uimm7>; + + def r32: ROTIRegInst<R32C, u7imm_i32, i32, uimm7>; + def r32_i16: ROTIRegInst<R32C, u7imm, i16, uimm7>; + def r32_i8: ROTIRegInst<R32C, u7imm_i8, i8, uimm7>; +} + +defm ROTI : RotateLeftWordImm; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate quad by byte (count) +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class 
ROTQBYInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b00111011100, OOL, IOL, "rotqby\t$rT, $rA, $rB", + RotShiftQuad, pattern>; + +class ROTQBYGenInst<ValueType type, RegisterClass rc>: + ROTQBYInst<(outs rc:$rT), (ins rc:$rA, R32C:$rB), + [(set (type rc:$rT), + (SPUrotbytes_left (type rc:$rA), R32C:$rB))]>; + +class ROTQBYVecInst<ValueType type>: + ROTQBYGenInst<type, VECREG>; + +multiclass RotateQuadLeftByBytes +{ + def v16i8: ROTQBYVecInst<v16i8>; + def v8i16: ROTQBYVecInst<v8i16>; + def v4i32: ROTQBYVecInst<v4i32>; + def v4f32: ROTQBYVecInst<v4f32>; + def v2i64: ROTQBYVecInst<v2i64>; + def v2f64: ROTQBYVecInst<v2f64>; + def i128: ROTQBYGenInst<i128, GPRC>; +} + +defm ROTQBY: RotateQuadLeftByBytes; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate quad by byte (count), immediate +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTQBYIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b00111111100, OOL, IOL, "rotqbyi\t$rT, $rA, $val", + RotShiftQuad, pattern>; + +class ROTQBYIGenInst<ValueType type, RegisterClass rclass>: + ROTQBYIInst<(outs rclass:$rT), (ins rclass:$rA, u7imm:$val), + [(set (type rclass:$rT), + (SPUrotbytes_left (type rclass:$rA), (i16 uimm7:$val)))]>; + +class ROTQBYIVecInst<ValueType vectype>: + ROTQBYIGenInst<vectype, VECREG>; + +multiclass RotateQuadByBytesImm +{ + def v16i8: ROTQBYIVecInst<v16i8>; + def v8i16: ROTQBYIVecInst<v8i16>; + def v4i32: ROTQBYIVecInst<v4i32>; + def v4f32: ROTQBYIVecInst<v4f32>; + def v2i64: ROTQBYIVecInst<v2i64>; + def vfi64: ROTQBYIVecInst<v2f64>; + def i128: ROTQBYIGenInst<i128, GPRC>; +} + +defm ROTQBYI: RotateQuadByBytesImm; + +// See ROTQBY note above. +class ROTQBYBIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b00110011100, OOL, IOL, + "rotqbybi\t$rT, $rA, $shift", + RotShiftQuad, pattern>; + +class ROTQBYBIVecInst<ValueType vectype, RegisterClass rclass>: + ROTQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, rclass:$shift), + [(set (vectype VECREG:$rT), + (SPUrotbytes_left_bits (vectype VECREG:$rA), rclass:$shift))]>; + +multiclass RotateQuadByBytesByBitshift { + def v16i8_r32: ROTQBYBIVecInst<v16i8, R32C>; + def v8i16_r32: ROTQBYBIVecInst<v8i16, R32C>; + def v4i32_r32: ROTQBYBIVecInst<v4i32, R32C>; + def v2i64_r32: ROTQBYBIVecInst<v2i64, R32C>; +} + +defm ROTQBYBI : RotateQuadByBytesByBitshift; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// See ROTQBY note above. 
+// +// Assume that the user of this instruction knows to shift the rotate count +// into bit 29 +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTQBIInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b00011011100, OOL, IOL, "rotqbi\t$rT, $rA, $rB", + RotShiftQuad, pattern>; + +class ROTQBIVecInst<ValueType vectype>: + ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [/* no pattern yet */]>; + +class ROTQBIRegInst<RegisterClass rclass>: + ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB), + [/* no pattern yet */]>; + +multiclass RotateQuadByBitCount +{ + def v16i8: ROTQBIVecInst<v16i8>; + def v8i16: ROTQBIVecInst<v8i16>; + def v4i32: ROTQBIVecInst<v4i32>; + def v2i64: ROTQBIVecInst<v2i64>; + + def r128: ROTQBIRegInst<GPRC>; + def r64: ROTQBIRegInst<R64C>; +} + +defm ROTQBI: RotateQuadByBitCount; + +class ROTQBIIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b00011111100, OOL, IOL, "rotqbii\t$rT, $rA, $val", + RotShiftQuad, pattern>; + +class ROTQBIIVecInst<ValueType vectype, Operand optype, ValueType inttype, + PatLeaf pred>: + ROTQBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, optype:$val), + [/* no pattern yet */]>; + +class ROTQBIIRegInst<RegisterClass rclass, Operand optype, ValueType inttype, + PatLeaf pred>: + ROTQBIIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val), + [/* no pattern yet */]>; + +multiclass RotateQuadByBitCountImm +{ + def v16i8: ROTQBIIVecInst<v16i8, u7imm_i32, i32, uimm7>; + def v8i16: ROTQBIIVecInst<v8i16, u7imm_i32, i32, uimm7>; + def v4i32: ROTQBIIVecInst<v4i32, u7imm_i32, i32, uimm7>; + def v2i64: ROTQBIIVecInst<v2i64, u7imm_i32, i32, uimm7>; + + def r128: ROTQBIIRegInst<GPRC, u7imm_i32, i32, uimm7>; + def r64: ROTQBIIRegInst<R64C, u7imm_i32, i32, uimm7>; +} + +defm ROTQBII : RotateQuadByBitCountImm; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// ROTHM v8i16 form: +// NOTE(1): No vector rotate is generated by the C/C++ frontend (today), +// so this only matches a synthetically generated/lowered code +// fragment. +// NOTE(2): $rB must be negated before the right rotate! +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTHMInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10111010000, OOL, IOL, "rothm\t$rT, $rA, $rB", + RotShiftVec, pattern>; + +def ROTHMv8i16: + ROTHMInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [/* see patterns below - $rB must be negated */]>; + +def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)), + (ROTHMv8i16 VECREG:$rA, (SFHIvec VECREG:$rB, 0))>; + +// ROTHM r16 form: Rotate 16-bit quantity to right, zero fill at the left +// Note: This instruction doesn't match a pattern because rB must be negated +// for the instruction to work. Thus, the pattern below the instruction! + +def ROTHMr16: + ROTHMInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB), + [/* see patterns below - $rB must be negated! */]>; + +def : Pat<(srl R16C:$rA, R32C:$rB), + (ROTHMr16 R16C:$rA, (SFIr32 R32C:$rB, 0))>; + +def : Pat<(srl R16C:$rA, R16C:$rB), + (ROTHMr16 R16C:$rA, + (SFIr32 (XSHWr16 R16C:$rB), 0))>; + +def : Pat<(srl R16C:$rA, R8C:$rB), + (ROTHMr16 R16C:$rA, + (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB) ), 0))>; + +// ROTHMI v8i16 form: See the comment for ROTHM v8i16. The difference here is +// that the immediate can be complemented, so that the user doesn't have to +// worry about it. 
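+//
+// For reference, the negate-then-rotate selection above means that a plain
+// (srl R16C:$rA, R32C:$rB) ends up as the two-instruction sequence
+//
+//   sfi    $t, $rB, 0       ; $t = 0 - $rB
+//   rothm  $rT, $rA, $t     ; right shift halfword by $rB, zero fill
+//
+// which is why the ROTHM/ROTM defs themselves carry no DAG patterns.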
+ +class ROTHMIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b10111110000, OOL, IOL, "rothmi\t$rT, $rA, $val", + RotShiftVec, pattern>; + +def ROTHMIv8i16: + ROTHMIInst<(outs VECREG:$rT), (ins VECREG:$rA, rothNeg7imm:$val), + [/* no pattern */]>; + +def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i32 imm:$val)), + (ROTHMIv8i16 VECREG:$rA, imm:$val)>; + +def: Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i16 imm:$val)), + (ROTHMIv8i16 VECREG:$rA, (TO_IMM32 imm:$val))>; + +def: Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i8 imm:$val)), + (ROTHMIv8i16 VECREG:$rA, (TO_IMM32 imm:$val))>; + +def ROTHMIr16: + ROTHMIInst<(outs R16C:$rT), (ins R16C:$rA, rothNeg7imm:$val), + [/* no pattern */]>; + +def: Pat<(srl R16C:$rA, (i32 uimm7:$val)), + (ROTHMIr16 R16C:$rA, uimm7:$val)>; + +def: Pat<(srl R16C:$rA, (i16 uimm7:$val)), + (ROTHMIr16 R16C:$rA, (TO_IMM32 uimm7:$val))>; + +def: Pat<(srl R16C:$rA, (i8 uimm7:$val)), + (ROTHMIr16 R16C:$rA, (TO_IMM32 uimm7:$val))>; + +// ROTM v4i32 form: See the ROTHM v8i16 comments. +class ROTMInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10011010000, OOL, IOL, "rotm\t$rT, $rA, $rB", + RotShiftVec, pattern>; + +def ROTMv4i32: + ROTMInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [/* see patterns below - $rB must be negated */]>; + +def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), + (ROTMv4i32 VECREG:$rA, (SFIvec VECREG:$rB, 0))>; + +def ROTMr32: + ROTMInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), + [/* see patterns below - $rB must be negated */]>; + +def : Pat<(srl R32C:$rA, R32C:$rB), + (ROTMr32 R32C:$rA, (SFIr32 R32C:$rB, 0))>; + +def : Pat<(srl R32C:$rA, R16C:$rB), + (ROTMr32 R32C:$rA, + (SFIr32 (XSHWr16 R16C:$rB), 0))>; + +def : Pat<(srl R32C:$rA, R8C:$rB), + (ROTMr32 R32C:$rA, + (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; + +// ROTMI v4i32 form: See the comment for ROTHM v8i16. +def ROTMIv4i32: + RI7Form<0b10011110000, (outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val), + "rotmi\t$rT, $rA, $val", RotShiftVec, + [(set (v4i32 VECREG:$rT), + (SPUvec_srl VECREG:$rA, (i32 uimm7:$val)))]>; + +def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), (i16 uimm7:$val)), + (ROTMIv4i32 VECREG:$rA, (TO_IMM32 uimm7:$val))>; + +def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), (i8 uimm7:$val)), + (ROTMIv4i32 VECREG:$rA, (TO_IMM32 uimm7:$val))>; + +// ROTMI r32 form: know how to complement the immediate value. +def ROTMIr32: + RI7Form<0b10011110000, (outs R32C:$rT), (ins R32C:$rA, rotNeg7imm:$val), + "rotmi\t$rT, $rA, $val", RotShiftVec, + [(set R32C:$rT, (srl R32C:$rA, (i32 uimm7:$val)))]>; + +def : Pat<(srl R32C:$rA, (i16 imm:$val)), + (ROTMIr32 R32C:$rA, (TO_IMM32 uimm7:$val))>; + +def : Pat<(srl R32C:$rA, (i8 imm:$val)), + (ROTMIr32 R32C:$rA, (TO_IMM32 uimm7:$val))>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// ROTQMBY: This is a vector form merely so that when used in an +// instruction pattern, type checking will succeed. This instruction assumes +// that the user knew to negate $rB. 
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTQMBYInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10111011100, OOL, IOL, "rotqmby\t$rT, $rA, $rB", + RotShiftQuad, pattern>; + +class ROTQMBYVecInst<ValueType vectype>: + ROTQMBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [/* no pattern, $rB must be negated */]>; + +class ROTQMBYRegInst<RegisterClass rclass>: + ROTQMBYInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB), + [/* no pattern */]>; + +multiclass RotateQuadBytes +{ + def v16i8: ROTQMBYVecInst<v16i8>; + def v8i16: ROTQMBYVecInst<v8i16>; + def v4i32: ROTQMBYVecInst<v4i32>; + def v2i64: ROTQMBYVecInst<v2i64>; + + def r128: ROTQMBYRegInst<GPRC>; + def r64: ROTQMBYRegInst<R64C>; +} + +defm ROTQMBY : RotateQuadBytes; + +def : Pat<(SPUsrl_bytes GPRC:$rA, R32C:$rB), + (ROTQMBYr128 GPRC:$rA, + (SFIr32 R32C:$rB, 0))>; + +class ROTQMBYIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val", + RotShiftQuad, pattern>; + +class ROTQMBYIVecInst<ValueType vectype>: + ROTQMBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val), + [/* no pattern */]>; + +class ROTQMBYIRegInst<RegisterClass rclass, Operand optype, ValueType inttype, + PatLeaf pred>: + ROTQMBYIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val), + [/* no pattern */]>; + +// 128-bit zero extension form: +class ROTQMBYIZExtInst<RegisterClass rclass, Operand optype, PatLeaf pred>: + ROTQMBYIInst<(outs GPRC:$rT), (ins rclass:$rA, optype:$val), + [/* no pattern */]>; + +multiclass RotateQuadBytesImm +{ + def v16i8: ROTQMBYIVecInst<v16i8>; + def v8i16: ROTQMBYIVecInst<v8i16>; + def v4i32: ROTQMBYIVecInst<v4i32>; + def v2i64: ROTQMBYIVecInst<v2i64>; + + def r128: ROTQMBYIRegInst<GPRC, rotNeg7imm, i32, uimm7>; + def r64: ROTQMBYIRegInst<R64C, rotNeg7imm, i32, uimm7>; + + def r128_zext_r8: ROTQMBYIZExtInst<R8C, rotNeg7imm, uimm7>; + def r128_zext_r16: ROTQMBYIZExtInst<R16C, rotNeg7imm, uimm7>; + def r128_zext_r32: ROTQMBYIZExtInst<R32C, rotNeg7imm, uimm7>; + def r128_zext_r64: ROTQMBYIZExtInst<R64C, rotNeg7imm, uimm7>; +} + +defm ROTQMBYI : RotateQuadBytesImm; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate right and mask by bit count +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTQMBYBIInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10110011100, OOL, IOL, "rotqmbybi\t$rT, $rA, $rB", + RotShiftQuad, pattern>; + +class ROTQMBYBIVecInst<ValueType vectype>: + ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [/* no pattern, */]>; + +multiclass RotateMaskQuadByBitCount +{ + def v16i8: ROTQMBYBIVecInst<v16i8>; + def v8i16: ROTQMBYBIVecInst<v8i16>; + def v4i32: ROTQMBYBIVecInst<v4i32>; + def v2i64: ROTQMBYBIVecInst<v2i64>; + def r128: ROTQMBYBIInst<(outs GPRC:$rT), (ins GPRC:$rA, R32C:$rB), + [/*no pattern*/]>; +} + +defm ROTQMBYBI: RotateMaskQuadByBitCount; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate quad and mask by bits +// Note that the rotate amount has to be negated +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTQMBIInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b10011011100, OOL, IOL, "rotqmbi\t$rT, $rA, $rB", + RotShiftQuad, pattern>; + +class ROTQMBIVecInst<ValueType vectype>: + ROTQMBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [/* no pattern */]>; + +class ROTQMBIRegInst<RegisterClass rclass>: + ROTQMBIInst<(outs rclass:$rT), (ins 
rclass:$rA, R32C:$rB), + [/* no pattern */]>; + +multiclass RotateMaskQuadByBits +{ + def v16i8: ROTQMBIVecInst<v16i8>; + def v8i16: ROTQMBIVecInst<v8i16>; + def v4i32: ROTQMBIVecInst<v4i32>; + def v2i64: ROTQMBIVecInst<v2i64>; + + def r128: ROTQMBIRegInst<GPRC>; + def r64: ROTQMBIRegInst<R64C>; +} + +defm ROTQMBI: RotateMaskQuadByBits; + +def : Pat<(srl GPRC:$rA, R32C:$rB), + (ROTQMBYBIr128 (ROTQMBIr128 GPRC:$rA, + (SFIr32 R32C:$rB, 0)), + (SFIr32 R32C:$rB, 0))>; + + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Rotate quad and mask by bits, immediate +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class ROTQMBIIInst<dag OOL, dag IOL, list<dag> pattern>: + RI7Form<0b10011111100, OOL, IOL, "rotqmbii\t$rT, $rA, $val", + RotShiftQuad, pattern>; + +class ROTQMBIIVecInst<ValueType vectype>: + ROTQMBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val), + [/* no pattern */]>; + +class ROTQMBIIRegInst<RegisterClass rclass>: + ROTQMBIIInst<(outs rclass:$rT), (ins rclass:$rA, rotNeg7imm:$val), + [/* no pattern */]>; + +multiclass RotateMaskQuadByBitsImm +{ + def v16i8: ROTQMBIIVecInst<v16i8>; + def v8i16: ROTQMBIIVecInst<v8i16>; + def v4i32: ROTQMBIIVecInst<v4i32>; + def v2i64: ROTQMBIIVecInst<v2i64>; + + def r128: ROTQMBIIRegInst<GPRC>; + def r64: ROTQMBIIRegInst<R64C>; +} + +defm ROTQMBII: RotateMaskQuadByBitsImm; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def ROTMAHv8i16: + RRForm<0b01111010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "rotmah\t$rT, $rA, $rB", RotShiftVec, + [/* see patterns below - $rB must be negated */]>; + +def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)), + (ROTMAHv8i16 VECREG:$rA, (SFHIvec VECREG:$rB, 0))>; + +def ROTMAHr16: + RRForm<0b01111010000, (outs R16C:$rT), (ins R16C:$rA, R32C:$rB), + "rotmah\t$rT, $rA, $rB", RotShiftVec, + [/* see patterns below - $rB must be negated */]>; + +def : Pat<(sra R16C:$rA, R32C:$rB), + (ROTMAHr16 R16C:$rA, (SFIr32 R32C:$rB, 0))>; + +def : Pat<(sra R16C:$rA, R16C:$rB), + (ROTMAHr16 R16C:$rA, + (SFIr32 (XSHWr16 R16C:$rB), 0))>; + +def : Pat<(sra R16C:$rA, R8C:$rB), + (ROTMAHr16 R16C:$rA, + (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; + +def ROTMAHIv8i16: + RRForm<0b01111110000, (outs VECREG:$rT), (ins VECREG:$rA, rothNeg7imm:$val), + "rotmahi\t$rT, $rA, $val", RotShiftVec, + [(set (v8i16 VECREG:$rT), + (SPUvec_sra (v8i16 VECREG:$rA), (i32 uimm7:$val)))]>; + +def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (i16 uimm7:$val)), + (ROTMAHIv8i16 (v8i16 VECREG:$rA), (TO_IMM32 uimm7:$val))>; + +def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (i8 uimm7:$val)), + (ROTMAHIv8i16 (v8i16 VECREG:$rA), (TO_IMM32 uimm7:$val))>; + +def ROTMAHIr16: + RRForm<0b01111110000, (outs R16C:$rT), (ins R16C:$rA, rothNeg7imm_i16:$val), + "rotmahi\t$rT, $rA, $val", RotShiftVec, + [(set R16C:$rT, (sra R16C:$rA, (i16 uimm7:$val)))]>; + +def : Pat<(sra R16C:$rA, (i32 imm:$val)), + (ROTMAHIr16 R16C:$rA, (TO_IMM32 uimm7:$val))>; + +def : Pat<(sra R16C:$rA, (i8 imm:$val)), + (ROTMAHIr16 R16C:$rA, (TO_IMM32 uimm7:$val))>; + +def ROTMAv4i32: + RRForm<0b01011010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "rotma\t$rT, $rA, $rB", RotShiftVec, + [/* see patterns below - $rB must be negated */]>; + +def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), + (ROTMAv4i32 VECREG:$rA, (SFIvec (v4i32 VECREG:$rB), 0))>; + +def ROTMAr32: + RRForm<0b01011010000, (outs R32C:$rT), (ins 
R32C:$rA, R32C:$rB), + "rotma\t$rT, $rA, $rB", RotShiftVec, + [/* see patterns below - $rB must be negated */]>; + +def : Pat<(sra R32C:$rA, R32C:$rB), + (ROTMAr32 R32C:$rA, (SFIr32 R32C:$rB, 0))>; + +def : Pat<(sra R32C:$rA, R16C:$rB), + (ROTMAr32 R32C:$rA, + (SFIr32 (XSHWr16 R16C:$rB), 0))>; + +def : Pat<(sra R32C:$rA, R8C:$rB), + (ROTMAr32 R32C:$rA, + (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; + +class ROTMAIInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b01011110000, OOL, IOL, + "rotmai\t$rT, $rA, $val", + RotShiftVec, pattern>; + +class ROTMAIVecInst<ValueType vectype, Operand intop, ValueType inttype>: + ROTMAIInst<(outs VECREG:$rT), (ins VECREG:$rA, intop:$val), + [(set (vectype VECREG:$rT), + (SPUvec_sra VECREG:$rA, (inttype uimm7:$val)))]>; + +class ROTMAIRegInst<RegisterClass rclass, Operand intop, ValueType inttype>: + ROTMAIInst<(outs rclass:$rT), (ins rclass:$rA, intop:$val), + [(set rclass:$rT, (sra rclass:$rA, (inttype uimm7:$val)))]>; + +multiclass RotateMaskAlgebraicImm { + def v2i64_i32 : ROTMAIVecInst<v2i64, rotNeg7imm, i32>; + def v4i32_i32 : ROTMAIVecInst<v4i32, rotNeg7imm, i32>; + def r64_i32 : ROTMAIRegInst<R64C, rotNeg7imm, i32>; + def r32_i32 : ROTMAIRegInst<R32C, rotNeg7imm, i32>; +} + +defm ROTMAI : RotateMaskAlgebraicImm; + +//===----------------------------------------------------------------------===// +// Branch and conditionals: +//===----------------------------------------------------------------------===// + +let isTerminator = 1, isBarrier = 1 in { + // Halt If Equal (r32 preferred slot only, no vector form) + def HEQr32: + RRForm_3<0b00011011110, (outs), (ins R32C:$rA, R32C:$rB), + "heq\t$rA, $rB", BranchResolv, + [/* no pattern to match */]>; + + def HEQIr32 : + RI10Form_2<0b11111110, (outs), (ins R32C:$rA, s10imm:$val), + "heqi\t$rA, $val", BranchResolv, + [/* no pattern to match */]>; + + // HGT/HGTI: These instructions use signed arithmetic for the comparison, + // contrasting with HLGT/HLGTI, which use unsigned comparison: + def HGTr32: + RRForm_3<0b00011010010, (outs), (ins R32C:$rA, R32C:$rB), + "hgt\t$rA, $rB", BranchResolv, + [/* no pattern to match */]>; + + def HGTIr32: + RI10Form_2<0b11110010, (outs), (ins R32C:$rA, s10imm:$val), + "hgti\t$rA, $val", BranchResolv, + [/* no pattern to match */]>; + + def HLGTr32: + RRForm_3<0b00011011010, (outs), (ins R32C:$rA, R32C:$rB), + "hlgt\t$rA, $rB", BranchResolv, + [/* no pattern to match */]>; + + def HLGTIr32: + RI10Form_2<0b11111010, (outs), (ins R32C:$rA, s10imm:$val), + "hlgti\t$rA, $val", BranchResolv, + [/* no pattern to match */]>; +} + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Comparison operators for i8, i16 and i32: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class CEQBInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00001011110, OOL, IOL, "ceqb\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpEqualByte +{ + def v16i8 : + CEQBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v16i8 VECREG:$rT), (seteq (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + + def r8 : + CEQBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB), + [(set R8C:$rT, (seteq R8C:$rA, R8C:$rB))]>; +} + +class CEQBIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b01111110, OOL, IOL, "ceqbi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpEqualByteImm +{ + def v16i8 : + CEQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val), + [(set (v16i8 VECREG:$rT), (seteq (v16i8 VECREG:$rA), + v16i8SExt8Imm:$val))]>; + def r8: 
+ CEQBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val), + [(set R8C:$rT, (seteq R8C:$rA, immSExt8:$val))]>; +} + +class CEQHInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00010011110, OOL, IOL, "ceqh\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpEqualHalfword +{ + def v8i16 : CEQHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v8i16 VECREG:$rT), (seteq (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + + def r16 : CEQHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB), + [(set R16C:$rT, (seteq R16C:$rA, R16C:$rB))]>; +} + +class CEQHIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b10111110, OOL, IOL, "ceqhi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpEqualHalfwordImm +{ + def v8i16 : CEQHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v8i16 VECREG:$rT), + (seteq (v8i16 VECREG:$rA), + (v8i16 v8i16SExt10Imm:$val)))]>; + def r16 : CEQHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val), + [(set R16C:$rT, (seteq R16C:$rA, i16ImmSExt10:$val))]>; +} + +class CEQInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00000011110, OOL, IOL, "ceq\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpEqualWord +{ + def v4i32 : CEQInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v4i32 VECREG:$rT), + (seteq (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + + def r32 : CEQInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), + [(set R32C:$rT, (seteq R32C:$rA, R32C:$rB))]>; +} + +class CEQIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b00111110, OOL, IOL, "ceqi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpEqualWordImm +{ + def v4i32 : CEQIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v4i32 VECREG:$rT), + (seteq (v4i32 VECREG:$rA), + (v4i32 v4i32SExt16Imm:$val)))]>; + + def r32: CEQIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val), + [(set R32C:$rT, (seteq R32C:$rA, i32ImmSExt10:$val))]>; +} + +class CGTBInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00001010010, OOL, IOL, "cgtb\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpGtrByte +{ + def v16i8 : + CGTBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v16i8 VECREG:$rT), (setgt (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + + def r8 : + CGTBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB), + [(set R8C:$rT, (setgt R8C:$rA, R8C:$rB))]>; +} + +class CGTBIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b01110010, OOL, IOL, "cgtbi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpGtrByteImm +{ + def v16i8 : + CGTBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val), + [(set (v16i8 VECREG:$rT), (setgt (v16i8 VECREG:$rA), + v16i8SExt8Imm:$val))]>; + def r8: + CGTBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val), + [(set R8C:$rT, (setgt R8C:$rA, immSExt8:$val))]>; +} + +class CGTHInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00010010010, OOL, IOL, "cgth\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpGtrHalfword +{ + def v8i16 : CGTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v8i16 VECREG:$rT), (setgt (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + + def r16 : CGTHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB), + [(set R16C:$rT, (setgt R16C:$rA, R16C:$rB))]>; +} + +class CGTHIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b10110010, OOL, IOL, "cgthi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpGtrHalfwordImm +{ + def v8i16 : CGTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v8i16 VECREG:$rT), + (setgt (v8i16 VECREG:$rA), + (v8i16 
v8i16SExt10Imm:$val)))]>; + def r16 : CGTHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val), + [(set R16C:$rT, (setgt R16C:$rA, i16ImmSExt10:$val))]>; +} + +class CGTInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00000010010, OOL, IOL, "cgt\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpGtrWord +{ + def v4i32 : CGTInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v4i32 VECREG:$rT), + (setgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + + def r32 : CGTInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), + [(set R32C:$rT, (setgt R32C:$rA, R32C:$rB))]>; +} + +class CGTIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b00110010, OOL, IOL, "cgti\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpGtrWordImm +{ + def v4i32 : CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v4i32 VECREG:$rT), + (setgt (v4i32 VECREG:$rA), + (v4i32 v4i32SExt16Imm:$val)))]>; + + def r32: CGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val), + [(set R32C:$rT, (setgt R32C:$rA, i32ImmSExt10:$val))]>; + + // CGTIv4f32, CGTIf32: These are used in the f32 fdiv instruction sequence: + def v4f32: CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v4i32 VECREG:$rT), + (setgt (v4i32 (bitconvert (v4f32 VECREG:$rA))), + (v4i32 v4i32SExt16Imm:$val)))]>; + + def f32: CGTIInst<(outs R32C:$rT), (ins R32FP:$rA, s10imm_i32:$val), + [/* no pattern */]>; +} + +class CLGTBInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00001011010, OOL, IOL, "clgtb\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpLGtrByte +{ + def v16i8 : + CLGTBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v16i8 VECREG:$rT), (setugt (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + + def r8 : + CLGTBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB), + [(set R8C:$rT, (setugt R8C:$rA, R8C:$rB))]>; +} + +class CLGTBIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b01111010, OOL, IOL, "clgtbi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpLGtrByteImm +{ + def v16i8 : + CLGTBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val), + [(set (v16i8 VECREG:$rT), (setugt (v16i8 VECREG:$rA), + v16i8SExt8Imm:$val))]>; + def r8: + CLGTBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val), + [(set R8C:$rT, (setugt R8C:$rA, immSExt8:$val))]>; +} + +class CLGTHInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00010011010, OOL, IOL, "clgth\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpLGtrHalfword +{ + def v8i16 : CLGTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v8i16 VECREG:$rT), (setugt (v8i16 VECREG:$rA), + (v8i16 VECREG:$rB)))]>; + + def r16 : CLGTHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB), + [(set R16C:$rT, (setugt R16C:$rA, R16C:$rB))]>; +} + +class CLGTHIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b10111010, OOL, IOL, "clgthi\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpLGtrHalfwordImm +{ + def v8i16 : CLGTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v8i16 VECREG:$rT), + (setugt (v8i16 VECREG:$rA), + (v8i16 v8i16SExt10Imm:$val)))]>; + def r16 : CLGTHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val), + [(set R16C:$rT, (setugt R16C:$rA, i16ImmSExt10:$val))]>; +} + +class CLGTInst<dag OOL, dag IOL, list<dag> pattern> : + RRForm<0b00000011010, OOL, IOL, "clgt\t$rT, $rA, $rB", + ByteOp, pattern>; + +multiclass CmpLGtrWord +{ + def v4i32 : CLGTInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (v4i32 VECREG:$rT), + (setugt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + + def r32 : 
CLGTInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), + [(set R32C:$rT, (setugt R32C:$rA, R32C:$rB))]>; +} + +class CLGTIInst<dag OOL, dag IOL, list<dag> pattern> : + RI10Form<0b00111010, OOL, IOL, "clgti\t$rT, $rA, $val", + ByteOp, pattern>; + +multiclass CmpLGtrWordImm +{ + def v4i32 : CLGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v4i32 VECREG:$rT), + (setugt (v4i32 VECREG:$rA), + (v4i32 v4i32SExt16Imm:$val)))]>; + + def r32: CLGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val), + [(set R32C:$rT, (setugt R32C:$rA, i32ImmSExt10:$val))]>; +} + +defm CEQB : CmpEqualByte; +defm CEQBI : CmpEqualByteImm; +defm CEQH : CmpEqualHalfword; +defm CEQHI : CmpEqualHalfwordImm; +defm CEQ : CmpEqualWord; +defm CEQI : CmpEqualWordImm; +defm CGTB : CmpGtrByte; +defm CGTBI : CmpGtrByteImm; +defm CGTH : CmpGtrHalfword; +defm CGTHI : CmpGtrHalfwordImm; +defm CGT : CmpGtrWord; +defm CGTI : CmpGtrWordImm; +defm CLGTB : CmpLGtrByte; +defm CLGTBI : CmpLGtrByteImm; +defm CLGTH : CmpLGtrHalfword; +defm CLGTHI : CmpLGtrHalfwordImm; +defm CLGT : CmpLGtrWord; +defm CLGTI : CmpLGtrWordImm; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// For SETCC primitives not supported above (setlt, setle, setge, etc.) +// define a pattern to generate the right code, as a binary operator +// (in a manner of speaking.) +// +// Notes: +// 1. This only matches the setcc set of conditionals. Special pattern +// matching is used for select conditionals. +// +// 2. The "DAG" versions of these classes are almost exclusively used for +// i64 comparisons. See the tblgen fundamentals documentation for what +// ".ResultInstrs[0]" means; see TargetSelectionDAG.td and the Pattern +// class for where ResultInstrs originates. +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class SETCCNegCondReg<PatFrag cond, RegisterClass rclass, ValueType inttype, + SPUInstr xorinst, SPUInstr cmpare>: + Pat<(cond rclass:$rA, rclass:$rB), + (xorinst (cmpare rclass:$rA, rclass:$rB), (inttype -1))>; + +class SETCCNegCondImm<PatFrag cond, RegisterClass rclass, ValueType inttype, + PatLeaf immpred, SPUInstr xorinst, SPUInstr cmpare>: + Pat<(cond rclass:$rA, (inttype immpred:$imm)), + (xorinst (cmpare rclass:$rA, (inttype immpred:$imm)), (inttype -1))>; + +def : SETCCNegCondReg<setne, R8C, i8, XORBIr8, CEQBr8>; +def : SETCCNegCondImm<setne, R8C, i8, immSExt8, XORBIr8, CEQBIr8>; + +def : SETCCNegCondReg<setne, R16C, i16, XORHIr16, CEQHr16>; +def : SETCCNegCondImm<setne, R16C, i16, i16ImmSExt10, XORHIr16, CEQHIr16>; + +def : SETCCNegCondReg<setne, R32C, i32, XORIr32, CEQr32>; +def : SETCCNegCondImm<setne, R32C, i32, i32ImmSExt10, XORIr32, CEQIr32>; + +class SETCCBinOpReg<PatFrag cond, RegisterClass rclass, + SPUInstr binop, SPUInstr cmpOp1, SPUInstr cmpOp2>: + Pat<(cond rclass:$rA, rclass:$rB), + (binop (cmpOp1 rclass:$rA, rclass:$rB), + (cmpOp2 rclass:$rA, rclass:$rB))>; + +class SETCCBinOpImm<PatFrag cond, RegisterClass rclass, PatLeaf immpred, + ValueType immtype, + SPUInstr binop, SPUInstr cmpOp1, SPUInstr cmpOp2>: + Pat<(cond rclass:$rA, (immtype immpred:$imm)), + (binop (cmpOp1 rclass:$rA, (immtype immpred:$imm)), + (cmpOp2 rclass:$rA, (immtype immpred:$imm)))>; + +def : SETCCBinOpReg<setge, R8C, ORr8, CGTBr8, CEQBr8>; +def : SETCCBinOpImm<setge, R8C, immSExt8, i8, ORr8, CGTBIr8, CEQBIr8>; +def : SETCCBinOpReg<setlt, R8C, NORr8, CGTBr8, CEQBr8>; +def : SETCCBinOpImm<setlt, R8C, immSExt8, i8, NORr8, CGTBIr8, CEQBIr8>; +def : Pat<(setle R8C:$rA, R8C:$rB), + (XORBIr8 (CGTBr8 
R8C:$rA, R8C:$rB), 0xff)>; +def : Pat<(setle R8C:$rA, immU8:$imm), + (XORBIr8 (CGTBIr8 R8C:$rA, immU8:$imm), 0xff)>; + +def : SETCCBinOpReg<setge, R16C, ORr16, CGTHr16, CEQHr16>; +def : SETCCBinOpImm<setge, R16C, i16ImmSExt10, i16, + ORr16, CGTHIr16, CEQHIr16>; +def : SETCCBinOpReg<setlt, R16C, NORr16, CGTHr16, CEQHr16>; +def : SETCCBinOpImm<setlt, R16C, i16ImmSExt10, i16, NORr16, CGTHIr16, CEQHIr16>; +def : Pat<(setle R16C:$rA, R16C:$rB), + (XORHIr16 (CGTHr16 R16C:$rA, R16C:$rB), 0xffff)>; +def : Pat<(setle R16C:$rA, i16ImmSExt10:$imm), + (XORHIr16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$imm), 0xffff)>; + +def : SETCCBinOpReg<setge, R32C, ORr32, CGTr32, CEQr32>; +def : SETCCBinOpImm<setge, R32C, i32ImmSExt10, i32, + ORr32, CGTIr32, CEQIr32>; +def : SETCCBinOpReg<setlt, R32C, NORr32, CGTr32, CEQr32>; +def : SETCCBinOpImm<setlt, R32C, i32ImmSExt10, i32, NORr32, CGTIr32, CEQIr32>; +def : Pat<(setle R32C:$rA, R32C:$rB), + (XORIr32 (CGTr32 R32C:$rA, R32C:$rB), 0xffffffff)>; +def : Pat<(setle R32C:$rA, i32ImmSExt10:$imm), + (XORIr32 (CGTIr32 R32C:$rA, i32ImmSExt10:$imm), 0xffffffff)>; + +def : SETCCBinOpReg<setuge, R8C, ORr8, CLGTBr8, CEQBr8>; +def : SETCCBinOpImm<setuge, R8C, immSExt8, i8, ORr8, CLGTBIr8, CEQBIr8>; +def : SETCCBinOpReg<setult, R8C, NORr8, CLGTBr8, CEQBr8>; +def : SETCCBinOpImm<setult, R8C, immSExt8, i8, NORr8, CLGTBIr8, CEQBIr8>; +def : Pat<(setule R8C:$rA, R8C:$rB), + (XORBIr8 (CLGTBr8 R8C:$rA, R8C:$rB), 0xff)>; +def : Pat<(setule R8C:$rA, immU8:$imm), + (XORBIr8 (CLGTBIr8 R8C:$rA, immU8:$imm), 0xff)>; + +def : SETCCBinOpReg<setuge, R16C, ORr16, CLGTHr16, CEQHr16>; +def : SETCCBinOpImm<setuge, R16C, i16ImmSExt10, i16, + ORr16, CLGTHIr16, CEQHIr16>; +def : SETCCBinOpReg<setult, R16C, NORr16, CLGTHr16, CEQHr16>; +def : SETCCBinOpImm<setult, R16C, i16ImmSExt10, i16, NORr16, + CLGTHIr16, CEQHIr16>; +def : Pat<(setule R16C:$rA, R16C:$rB), + (XORHIr16 (CLGTHr16 R16C:$rA, R16C:$rB), 0xffff)>; +def : Pat<(setule R16C:$rA, i16ImmSExt10:$imm), + (XORHIr16 (CLGTHIr16 R16C:$rA, i16ImmSExt10:$imm), 0xffff)>; + +def : SETCCBinOpReg<setuge, R32C, ORr32, CLGTr32, CEQr32>; +def : SETCCBinOpImm<setuge, R32C, i32ImmSExt10, i32, + ORr32, CLGTIr32, CEQIr32>; +def : SETCCBinOpReg<setult, R32C, NORr32, CLGTr32, CEQr32>; +def : SETCCBinOpImm<setult, R32C, i32ImmSExt10, i32, NORr32, CLGTIr32, CEQIr32>; +def : Pat<(setule R32C:$rA, R32C:$rB), + (XORIr32 (CLGTr32 R32C:$rA, R32C:$rB), 0xffffffff)>; +def : Pat<(setule R32C:$rA, i32ImmSExt10:$imm), + (XORIr32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$imm), 0xffffffff)>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// select conditional patterns: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +class SELECTNegCondReg<PatFrag cond, RegisterClass rclass, ValueType inttype, + SPUInstr selinstr, SPUInstr cmpare>: + Pat<(select (inttype (cond rclass:$rA, rclass:$rB)), + rclass:$rTrue, rclass:$rFalse), + (selinstr rclass:$rTrue, rclass:$rFalse, + (cmpare rclass:$rA, rclass:$rB))>; + +class SELECTNegCondImm<PatFrag cond, RegisterClass rclass, ValueType inttype, + PatLeaf immpred, SPUInstr selinstr, SPUInstr cmpare>: + Pat<(select (inttype (cond rclass:$rA, immpred:$imm)), + rclass:$rTrue, rclass:$rFalse), + (selinstr rclass:$rTrue, rclass:$rFalse, + (cmpare rclass:$rA, immpred:$imm))>; + +def : SELECTNegCondReg<setne, R8C, i8, SELBr8, CEQBr8>; +def : SELECTNegCondImm<setne, R8C, i8, immSExt8, SELBr8, CEQBIr8>; +def : SELECTNegCondReg<setle, R8C, i8, SELBr8, CGTBr8>; +def : SELECTNegCondImm<setle, R8C, i8, immSExt8, 
SELBr8, CGTBr8>; +def : SELECTNegCondReg<setule, R8C, i8, SELBr8, CLGTBr8>; +def : SELECTNegCondImm<setule, R8C, i8, immU8, SELBr8, CLGTBIr8>; + +def : SELECTNegCondReg<setne, R16C, i16, SELBr16, CEQHr16>; +def : SELECTNegCondImm<setne, R16C, i16, i16ImmSExt10, SELBr16, CEQHIr16>; +def : SELECTNegCondReg<setle, R16C, i16, SELBr16, CGTHr16>; +def : SELECTNegCondImm<setle, R16C, i16, i16ImmSExt10, SELBr16, CGTHIr16>; +def : SELECTNegCondReg<setule, R16C, i16, SELBr16, CLGTHr16>; +def : SELECTNegCondImm<setule, R16C, i16, i16ImmSExt10, SELBr16, CLGTHIr16>; + +def : SELECTNegCondReg<setne, R32C, i32, SELBr32, CEQr32>; +def : SELECTNegCondImm<setne, R32C, i32, i32ImmSExt10, SELBr32, CEQIr32>; +def : SELECTNegCondReg<setle, R32C, i32, SELBr32, CGTr32>; +def : SELECTNegCondImm<setle, R32C, i32, i32ImmSExt10, SELBr32, CGTIr32>; +def : SELECTNegCondReg<setule, R32C, i32, SELBr32, CLGTr32>; +def : SELECTNegCondImm<setule, R32C, i32, i32ImmSExt10, SELBr32, CLGTIr32>; + +class SELECTBinOpReg<PatFrag cond, RegisterClass rclass, ValueType inttype, + SPUInstr selinstr, SPUInstr binop, SPUInstr cmpOp1, + SPUInstr cmpOp2>: + Pat<(select (inttype (cond rclass:$rA, rclass:$rB)), + rclass:$rTrue, rclass:$rFalse), + (selinstr rclass:$rFalse, rclass:$rTrue, + (binop (cmpOp1 rclass:$rA, rclass:$rB), + (cmpOp2 rclass:$rA, rclass:$rB)))>; + +class SELECTBinOpImm<PatFrag cond, RegisterClass rclass, PatLeaf immpred, + ValueType inttype, + SPUInstr selinstr, SPUInstr binop, SPUInstr cmpOp1, + SPUInstr cmpOp2>: + Pat<(select (inttype (cond rclass:$rA, (inttype immpred:$imm))), + rclass:$rTrue, rclass:$rFalse), + (selinstr rclass:$rFalse, rclass:$rTrue, + (binop (cmpOp1 rclass:$rA, (inttype immpred:$imm)), + (cmpOp2 rclass:$rA, (inttype immpred:$imm))))>; + +def : SELECTBinOpReg<setge, R8C, i8, SELBr8, ORr8, CGTBr8, CEQBr8>; +def : SELECTBinOpImm<setge, R8C, immSExt8, i8, + SELBr8, ORr8, CGTBIr8, CEQBIr8>; + +def : SELECTBinOpReg<setge, R16C, i16, SELBr16, ORr16, CGTHr16, CEQHr16>; +def : SELECTBinOpImm<setge, R16C, i16ImmSExt10, i16, + SELBr16, ORr16, CGTHIr16, CEQHIr16>; + +def : SELECTBinOpReg<setge, R32C, i32, SELBr32, ORr32, CGTr32, CEQr32>; +def : SELECTBinOpImm<setge, R32C, i32ImmSExt10, i32, + SELBr32, ORr32, CGTIr32, CEQIr32>; + +def : SELECTBinOpReg<setuge, R8C, i8, SELBr8, ORr8, CLGTBr8, CEQBr8>; +def : SELECTBinOpImm<setuge, R8C, immSExt8, i8, + SELBr8, ORr8, CLGTBIr8, CEQBIr8>; + +def : SELECTBinOpReg<setuge, R16C, i16, SELBr16, ORr16, CLGTHr16, CEQHr16>; +def : SELECTBinOpImm<setuge, R16C, i16ImmUns10, i16, + SELBr16, ORr16, CLGTHIr16, CEQHIr16>; + +def : SELECTBinOpReg<setuge, R32C, i32, SELBr32, ORr32, CLGTr32, CEQr32>; +def : SELECTBinOpImm<setuge, R32C, i32ImmUns10, i32, + SELBr32, ORr32, CLGTIr32, CEQIr32>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +let isCall = 1, + // All calls clobber the non-callee-saved registers: + Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, + R10,R11,R12,R13,R14,R15,R16,R17,R18,R19, + R20,R21,R22,R23,R24,R25,R26,R27,R28,R29, + R30,R31,R32,R33,R34,R35,R36,R37,R38,R39, + R40,R41,R42,R43,R44,R45,R46,R47,R48,R49, + R50,R51,R52,R53,R54,R55,R56,R57,R58,R59, + R60,R61,R62,R63,R64,R65,R66,R67,R68,R69, + R70,R71,R72,R73,R74,R75,R76,R77,R78,R79], + // All of these instructions use $lr (aka $0) + Uses = [R0] in { + // Branch relative and set link: Used if we actually know that the target + // is within [-32768, 32767] bytes of the target + def BRSL: + BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func, variable_ops), + "brsl\t$$lr, $func", 
+ [(SPUcall (SPUpcrel tglobaladdr:$func, 0))]>; + + // Branch absolute and set link: Used if we actually know that the target + // is an absolute address + def BRASL: + BranchSetLink<0b011001100, (outs), (ins calltarget:$func, variable_ops), + "brasl\t$$lr, $func", + [(SPUcall (SPUaform tglobaladdr:$func, 0))]>; + + // Branch indirect and set link if external data. These instructions are not + // actually generated, matched by an intrinsic: + def BISLED_00: BISLEDForm<0b11, "bisled\t$$lr, $func", [/* empty pattern */]>; + def BISLED_E0: BISLEDForm<0b10, "bisled\t$$lr, $func", [/* empty pattern */]>; + def BISLED_0D: BISLEDForm<0b01, "bisled\t$$lr, $func", [/* empty pattern */]>; + def BISLED_ED: BISLEDForm<0b00, "bisled\t$$lr, $func", [/* empty pattern */]>; + + // Branch indirect and set link. This is the "X-form" address version of a + // function call + def BISL: + BIForm<0b10010101100, "bisl\t$$lr, $func", [(SPUcall R32C:$func)]>; +} + +// Support calls to external symbols: +def : Pat<(SPUcall (SPUpcrel texternalsym:$func, 0)), + (BRSL texternalsym:$func)>; + +def : Pat<(SPUcall (SPUaform texternalsym:$func, 0)), + (BRASL texternalsym:$func)>; + +// Unconditional branches: +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { + let isBarrier = 1 in { + def BR : + UncondBranch<0b001001100, (outs), (ins brtarget:$dest), + "br\t$dest", + [(br bb:$dest)]>; + + // Unconditional, absolute address branch + def BRA: + UncondBranch<0b001100000, (outs), (ins brtarget:$dest), + "bra\t$dest", + [/* no pattern */]>; + + // Indirect branch + let isIndirectBranch = 1 in { + def BI: + BIForm<0b00010101100, "bi\t$func", [(brind R32C:$func)]>; + } + } + + // Conditional branches: + class BRNZInst<dag IOL, list<dag> pattern>: + RI16Form<0b010000100, (outs), IOL, "brnz\t$rCond,$dest", + BranchResolv, pattern>; + + class BRNZRegInst<RegisterClass rclass>: + BRNZInst<(ins rclass:$rCond, brtarget:$dest), + [(brcond rclass:$rCond, bb:$dest)]>; + + class BRNZVecInst<ValueType vectype>: + BRNZInst<(ins VECREG:$rCond, brtarget:$dest), + [(brcond (vectype VECREG:$rCond), bb:$dest)]>; + + multiclass BranchNotZero { + def v4i32 : BRNZVecInst<v4i32>; + def r32 : BRNZRegInst<R32C>; + } + + defm BRNZ : BranchNotZero; + + class BRZInst<dag IOL, list<dag> pattern>: + RI16Form<0b000000100, (outs), IOL, "brz\t$rT,$dest", + BranchResolv, pattern>; + + class BRZRegInst<RegisterClass rclass>: + BRZInst<(ins rclass:$rT, brtarget:$dest), [/* no pattern */]>; + + class BRZVecInst<ValueType vectype>: + BRZInst<(ins VECREG:$rT, brtarget:$dest), [/* no pattern */]>; + + multiclass BranchZero { + def v4i32: BRZVecInst<v4i32>; + def r32: BRZRegInst<R32C>; + } + + defm BRZ: BranchZero; + + // Note: LLVM doesn't do branch conditional, indirect. 
Otherwise these would + // be useful: + /* + class BINZInst<dag IOL, list<dag> pattern>: + BICondForm<0b10010100100, (outs), IOL, "binz\t$rA, $dest", pattern>; + + class BINZRegInst<RegisterClass rclass>: + BINZInst<(ins rclass:$rA, brtarget:$dest), + [(brcond rclass:$rA, R32C:$dest)]>; + + class BINZVecInst<ValueType vectype>: + BINZInst<(ins VECREG:$rA, R32C:$dest), + [(brcond (vectype VECREG:$rA), R32C:$dest)]>; + + multiclass BranchNotZeroIndirect { + def v4i32: BINZVecInst<v4i32>; + def r32: BINZRegInst<R32C>; + } + + defm BINZ: BranchNotZeroIndirect; + + class BIZInst<dag IOL, list<dag> pattern>: + BICondForm<0b00010100100, (outs), IOL, "biz\t$rA, $func", pattern>; + + class BIZRegInst<RegisterClass rclass>: + BIZInst<(ins rclass:$rA, R32C:$func), [/* no pattern */]>; + + class BIZVecInst<ValueType vectype>: + BIZInst<(ins VECREG:$rA, R32C:$func), [/* no pattern */]>; + + multiclass BranchZeroIndirect { + def v4i32: BIZVecInst<v4i32>; + def r32: BIZRegInst<R32C>; + } + + defm BIZ: BranchZeroIndirect; + */ + + class BRHNZInst<dag IOL, list<dag> pattern>: + RI16Form<0b011000100, (outs), IOL, "brhnz\t$rCond,$dest", BranchResolv, + pattern>; + + class BRHNZRegInst<RegisterClass rclass>: + BRHNZInst<(ins rclass:$rCond, brtarget:$dest), + [(brcond rclass:$rCond, bb:$dest)]>; + + class BRHNZVecInst<ValueType vectype>: + BRHNZInst<(ins VECREG:$rCond, brtarget:$dest), [/* no pattern */]>; + + multiclass BranchNotZeroHalfword { + def v8i16: BRHNZVecInst<v8i16>; + def r16: BRHNZRegInst<R16C>; + } + + defm BRHNZ: BranchNotZeroHalfword; + + class BRHZInst<dag IOL, list<dag> pattern>: + RI16Form<0b001000100, (outs), IOL, "brhz\t$rT,$dest", BranchResolv, + pattern>; + + class BRHZRegInst<RegisterClass rclass>: + BRHZInst<(ins rclass:$rT, brtarget:$dest), [/* no pattern */]>; + + class BRHZVecInst<ValueType vectype>: + BRHZInst<(ins VECREG:$rT, brtarget:$dest), [/* no pattern */]>; + + multiclass BranchZeroHalfword { + def v8i16: BRHZVecInst<v8i16>; + def r16: BRHZRegInst<R16C>; + } + + defm BRHZ: BranchZeroHalfword; +} + +//===----------------------------------------------------------------------===// +// setcc and brcond patterns: +//===----------------------------------------------------------------------===// + +def : Pat<(brcond (i16 (seteq R16C:$rA, 0)), bb:$dest), + (BRHZr16 R16C:$rA, bb:$dest)>; +def : Pat<(brcond (i16 (setne R16C:$rA, 0)), bb:$dest), + (BRHNZr16 R16C:$rA, bb:$dest)>; + +def : Pat<(brcond (i32 (seteq R32C:$rA, 0)), bb:$dest), + (BRZr32 R32C:$rA, bb:$dest)>; +def : Pat<(brcond (i32 (setne R32C:$rA, 0)), bb:$dest), + (BRNZr32 R32C:$rA, bb:$dest)>; + +multiclass BranchCondEQ<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32> +{ + def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest), + (brinst16 (CEQHIr16 R16C:$rA, i16ImmSExt10:$val), bb:$dest)>; + + def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest), + (brinst16 (CEQHr16 R16C:$rA, R16:$rB), bb:$dest)>; + + def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest), + (brinst32 (CEQIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>; + + def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest), + (brinst32 (CEQr32 R32C:$rA, R32C:$rB), bb:$dest)>; +} + +defm BRCONDeq : BranchCondEQ<seteq, BRHNZr16, BRNZr32>; +defm BRCONDne : BranchCondEQ<setne, BRHZr16, BRZr32>; + +multiclass BranchCondLGT<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32> +{ + def r16imm : Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest), + (brinst16 (CLGTHIr16 R16C:$rA, 
i16ImmSExt10:$val), bb:$dest)>; + + def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest), + (brinst16 (CLGTHr16 R16C:$rA, R16:$rB), bb:$dest)>; + + def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest), + (brinst32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>; + + def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest), + (brinst32 (CLGTr32 R32C:$rA, R32C:$rB), bb:$dest)>; +} + +defm BRCONDugt : BranchCondLGT<setugt, BRHNZr16, BRNZr32>; +defm BRCONDule : BranchCondLGT<setule, BRHZr16, BRZr32>; + +multiclass BranchCondLGTEQ<PatFrag cond, SPUInstr orinst16, SPUInstr brinst16, + SPUInstr orinst32, SPUInstr brinst32> +{ + def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest), + (brinst16 (orinst16 (CLGTHIr16 R16C:$rA, i16ImmSExt10:$val), + (CEQHIr16 R16C:$rA, i16ImmSExt10:$val)), + bb:$dest)>; + + def r16: Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest), + (brinst16 (orinst16 (CLGTHr16 R16C:$rA, R16:$rB), + (CEQHr16 R16C:$rA, R16:$rB)), + bb:$dest)>; + + def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest), + (brinst32 (orinst32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$val), + (CEQIr32 R32C:$rA, i32ImmSExt10:$val)), + bb:$dest)>; + + def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest), + (brinst32 (orinst32 (CLGTr32 R32C:$rA, R32C:$rB), + (CEQr32 R32C:$rA, R32C:$rB)), + bb:$dest)>; +} + +defm BRCONDuge : BranchCondLGTEQ<setuge, ORr16, BRHNZr16, ORr32, BRNZr32>; +defm BRCONDult : BranchCondLGTEQ<setult, ORr16, BRHZr16, ORr32, BRZr32>; + +multiclass BranchCondGT<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32> +{ + def r16imm : Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest), + (brinst16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$val), bb:$dest)>; + + def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest), + (brinst16 (CGTHr16 R16C:$rA, R16:$rB), bb:$dest)>; + + def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest), + (brinst32 (CGTIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>; + + def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest), + (brinst32 (CGTr32 R32C:$rA, R32C:$rB), bb:$dest)>; +} + +defm BRCONDgt : BranchCondGT<setgt, BRHNZr16, BRNZr32>; +defm BRCONDle : BranchCondGT<setle, BRHZr16, BRZr32>; + +multiclass BranchCondGTEQ<PatFrag cond, SPUInstr orinst16, SPUInstr brinst16, + SPUInstr orinst32, SPUInstr brinst32> +{ + def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest), + (brinst16 (orinst16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$val), + (CEQHIr16 R16C:$rA, i16ImmSExt10:$val)), + bb:$dest)>; + + def r16: Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest), + (brinst16 (orinst16 (CGTHr16 R16C:$rA, R16:$rB), + (CEQHr16 R16C:$rA, R16:$rB)), + bb:$dest)>; + + def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest), + (brinst32 (orinst32 (CGTIr32 R32C:$rA, i32ImmSExt10:$val), + (CEQIr32 R32C:$rA, i32ImmSExt10:$val)), + bb:$dest)>; + + def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest), + (brinst32 (orinst32 (CGTr32 R32C:$rA, R32C:$rB), + (CEQr32 R32C:$rA, R32C:$rB)), + bb:$dest)>; +} + +defm BRCONDge : BranchCondGTEQ<setge, ORr16, BRHNZr16, ORr32, BRNZr32>; +defm BRCONDlt : BranchCondGTEQ<setlt, ORr16, BRHZr16, ORr32, BRZr32>; + +let isTerminator = 1, isBarrier = 1 in { + let isReturn = 1 in { + def RET: + RETForm<"bi\t$$lr", [(retflag)]>; + } +} + +//===----------------------------------------------------------------------===// +// Single precision floating point instructions 
+//===----------------------------------------------------------------------===// + +class FAInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b01011000100, OOL, IOL, "fa\t$rT, $rA, $rB", + SPrecFP, pattern>; + +class FAVecInst<ValueType vectype>: + FAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>; + +multiclass SFPAdd +{ + def v4f32: FAVecInst<v4f32>; + def f32: FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), + [(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>; +} + +defm FA : SFPAdd; + +class FSInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b01011000100, OOL, IOL, "fs\t$rT, $rA, $rB", + SPrecFP, pattern>; + +class FSVecInst<ValueType vectype>: + FSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (vectype VECREG:$rT), + (fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>; + +multiclass SFPSub +{ + def v4f32: FSVecInst<v4f32>; + def f32: FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), + [(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>; +} + +defm FS : SFPSub; + +class FMInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b01100011010, OOL, IOL, + "fm\t$rT, $rA, $rB", SPrecFP, + pattern>; + +class FMVecInst<ValueType type>: + FMInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (type VECREG:$rT), + (fmul (type VECREG:$rA), (type VECREG:$rB)))]>; + +multiclass SFPMul +{ + def v4f32: FMVecInst<v4f32>; + def f32: FMInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), + [(set R32FP:$rT, (fmul R32FP:$rA, R32FP:$rB))]>; +} + +defm FM : SFPMul; + +// Floating point multiply and add +// e.g. d = c + (a * b) +def FMAv4f32: + RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fma\t$rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), + (fadd (v4f32 VECREG:$rC), + (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB))))]>; + +def FMAf32: + RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), + "fma\t$rT, $rA, $rB, $rC", SPrecFP, + [(set R32FP:$rT, (fadd R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>; + +// FP multiply and subtract +// Subtracts value in rC from product +// res = a * b - c +def FMSv4f32 : + RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fms\t$rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), + (fsub (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)), + (v4f32 VECREG:$rC)))]>; + +def FMSf32 : + RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), + "fms\t$rT, $rA, $rB, $rC", SPrecFP, + [(set R32FP:$rT, + (fsub (fmul R32FP:$rA, R32FP:$rB), R32FP:$rC))]>; + +// Floating Negative Multiply and Subtract +// Subtracts product from value in rC +// res = fneg(fms a b c) +// = - (a * b - c) +// = c - a * b +// NOTE: subtraction order +// fsub a b = a - b +// fs a b = b - a? 
+def FNMSf32 : + RRRForm<0b1101, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), + "fnms\t$rT, $rA, $rB, $rC", SPrecFP, + [(set R32FP:$rT, (fsub R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>; + +def FNMSv4f32 : + RRRForm<0b1101, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fnms\t$rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), + (fsub (v4f32 VECREG:$rC), + (fmul (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB))))]>; + + + + +// Floating point reciprocal estimate + +class FRESTInst<dag OOL, dag IOL>: + RRForm_1<0b00110111000, OOL, IOL, + "frest\t$rT, $rA", SPrecFP, + [/* no pattern */]>; + +def FRESTv4f32 : + FRESTInst<(outs VECREG:$rT), (ins VECREG:$rA)>; + +def FRESTf32 : + FRESTInst<(outs R32FP:$rT), (ins R32FP:$rA)>; + +// Floating point interpolate (used in conjunction with reciprocal estimate) +def FIv4f32 : + RRForm<0b00101011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "fi\t$rT, $rA, $rB", SPrecFP, + [/* no pattern */]>; + +def FIf32 : + RRForm<0b00101011110, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), + "fi\t$rT, $rA, $rB", SPrecFP, + [/* no pattern */]>; + +//-------------------------------------------------------------------------- +// Basic single precision floating point comparisons: +// +// Note: There is no support on SPU for single precision NaN. Consequently, +// ordered and unordered comparisons are the same. +//-------------------------------------------------------------------------- + +def FCEQf32 : + RRForm<0b01000011110, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB), + "fceq\t$rT, $rA, $rB", SPrecFP, + [(set R32C:$rT, (setueq R32FP:$rA, R32FP:$rB))]>; + +def : Pat<(setoeq R32FP:$rA, R32FP:$rB), + (FCEQf32 R32FP:$rA, R32FP:$rB)>; + +def FCMEQf32 : + RRForm<0b01010011110, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB), + "fcmeq\t$rT, $rA, $rB", SPrecFP, + [(set R32C:$rT, (setueq (fabs R32FP:$rA), (fabs R32FP:$rB)))]>; + +def : Pat<(setoeq (fabs R32FP:$rA), (fabs R32FP:$rB)), + (FCMEQf32 R32FP:$rA, R32FP:$rB)>; + +def FCGTf32 : + RRForm<0b01000011010, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB), + "fcgt\t$rT, $rA, $rB", SPrecFP, + [(set R32C:$rT, (setugt R32FP:$rA, R32FP:$rB))]>; + +def : Pat<(setogt R32FP:$rA, R32FP:$rB), + (FCGTf32 R32FP:$rA, R32FP:$rB)>; + +def FCMGTf32 : + RRForm<0b01010011010, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB), + "fcmgt\t$rT, $rA, $rB", SPrecFP, + [(set R32C:$rT, (setugt (fabs R32FP:$rA), (fabs R32FP:$rB)))]>; + +def : Pat<(setogt (fabs R32FP:$rA), (fabs R32FP:$rB)), + (FCMGTf32 R32FP:$rA, R32FP:$rB)>; + +//-------------------------------------------------------------------------- +// Single precision floating point comparisons and SETCC equivalents: +//-------------------------------------------------------------------------- + +def : SETCCNegCondReg<setune, R32FP, i32, XORIr32, FCEQf32>; +def : SETCCNegCondReg<setone, R32FP, i32, XORIr32, FCEQf32>; + +def : SETCCBinOpReg<setuge, R32FP, ORr32, FCGTf32, FCEQf32>; +def : SETCCBinOpReg<setoge, R32FP, ORr32, FCGTf32, FCEQf32>; + +def : SETCCBinOpReg<setult, R32FP, NORr32, FCGTf32, FCEQf32>; +def : SETCCBinOpReg<setolt, R32FP, NORr32, FCGTf32, FCEQf32>; + +def : Pat<(setule R32FP:$rA, R32FP:$rB), + (XORIr32 (FCGTf32 R32FP:$rA, R32FP:$rB), 0xffffffff)>; +def : Pat<(setole R32FP:$rA, R32FP:$rB), + (XORIr32 (FCGTf32 R32FP:$rA, R32FP:$rB), 0xffffffff)>; + +// FP Status and Control Register Write +// Why isn't rT a don't care in the ISA? +// Should we create a special RRForm_3 for this guy and zero out the rT? 
+def FSCRWf32 : + RRForm_1<0b01011101110, (outs R32FP:$rT), (ins R32FP:$rA), + "fscrwr\t$rA", SPrecFP, + [/* This instruction requires an intrinsic. Note: rT is unused. */]>; + +// FP Status and Control Register Read +def FSCRRf32 : + RRForm_2<0b01011101110, (outs R32FP:$rT), (ins), + "fscrrd\t$rT", SPrecFP, + [/* This instruction requires an intrinsic */]>; + +// llvm instruction space +// How do these map onto cell instructions? +// fdiv rA rB +// frest rC rB # c = 1/b (both lines) +// fi rC rB rC +// fm rD rA rC # d = a * 1/b +// fnms rB rD rB rA # b = - (d * b - a) --should == 0 in a perfect world +// fma rB rB rC rD # b = b * c + d +// = -(d *b -a) * c + d +// = a * c - c ( a *b *c - a) + +// fcopysign (???) + +// Library calls: +// These llvm instructions will actually map to library calls. +// All that's needed, then, is to check that the appropriate library is +// imported and do a brsl to the proper function name. +// frem # fmod(x, y): x - (x/y) * y +// (Note: fmod(double, double), fmodf(float,float)) +// fsqrt? +// fsin? +// fcos? +// Unimplemented SPU instruction space +// floating reciprocal absolute square root estimate (frsqest) + +// The following are probably just intrinsics +// status and control register write +// status and control register read + +//-------------------------------------- +// Floating Point Conversions +// Signed conversions: +def CSiFv4f32: + CVTIntFPForm<0b0101101110, (outs VECREG:$rT), (ins VECREG:$rA), + "csflt\t$rT, $rA, 0", SPrecFP, + [(set (v4f32 VECREG:$rT), (sint_to_fp (v4i32 VECREG:$rA)))]>; + +// Convert signed integer to floating point +def CSiFf32 : + CVTIntFPForm<0b0101101110, (outs R32FP:$rT), (ins R32C:$rA), + "csflt\t$rT, $rA, 0", SPrecFP, + [(set R32FP:$rT, (sint_to_fp R32C:$rA))]>; + +// Convert unsigned int to float +def CUiFv4f32 : + CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA), + "cuflt\t$rT, $rA, 0", SPrecFP, + [(set (v4f32 VECREG:$rT), (uint_to_fp (v4i32 VECREG:$rA)))]>; + +def CUiFf32 : + CVTIntFPForm<0b1101101110, (outs R32FP:$rT), (ins R32C:$rA), + "cuflt\t$rT, $rA, 0", SPrecFP, + [(set R32FP:$rT, (uint_to_fp R32C:$rA))]>; + +// Convert float to unsigned int +// Assume that scale = 0 + +def CFUiv4f32 : + CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA), + "cfltu\t$rT, $rA, 0", SPrecFP, + [(set (v4i32 VECREG:$rT), (fp_to_uint (v4f32 VECREG:$rA)))]>; + +def CFUif32 : + CVTIntFPForm<0b1101101110, (outs R32C:$rT), (ins R32FP:$rA), + "cfltu\t$rT, $rA, 0", SPrecFP, + [(set R32C:$rT, (fp_to_uint R32FP:$rA))]>; + +// Convert float to signed int +// Assume that scale = 0 + +def CFSiv4f32 : + CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA), + "cflts\t$rT, $rA, 0", SPrecFP, + [(set (v4i32 VECREG:$rT), (fp_to_sint (v4f32 VECREG:$rA)))]>; + +def CFSif32 : + CVTIntFPForm<0b1101101110, (outs R32C:$rT), (ins R32FP:$rA), + "cflts\t$rT, $rA, 0", SPrecFP, + [(set R32C:$rT, (fp_to_sint R32FP:$rA))]>; + +//===----------------------------------------------------------------------==// +// Single<->Double precision conversions +//===----------------------------------------------------------------------==// + +// NOTE: We use "vec" name suffix here to avoid confusion (e.g. input is a +// v4f32, output is v2f64--which goes in the name?) + +// Floating point extend single to double +// NOTE: Not sure if passing in v4f32 to FESDvec is correct since it +// operates on two double-word slots (i.e. 1st and 3rd fp numbers +// are ignored). 
+def FESDvec : + RRForm_1<0b00011101110, (outs VECREG:$rT), (ins VECREG:$rA), + "fesd\t$rT, $rA", SPrecFP, + [/*(set (v2f64 VECREG:$rT), (fextend (v4f32 VECREG:$rA)))*/]>; + +def FESDf32 : + RRForm_1<0b00011101110, (outs R64FP:$rT), (ins R32FP:$rA), + "fesd\t$rT, $rA", SPrecFP, + [(set R64FP:$rT, (fextend R32FP:$rA))]>; + +// Floating point round double to single +//def FRDSvec : +// RRForm_1<0b10011101110, (outs VECREG:$rT), (ins VECREG:$rA), +// "frds\t$rT, $rA,", SPrecFP, +// [(set (v4f32 R32FP:$rT), (fround (v2f64 R64FP:$rA)))]>; + +def FRDSf64 : + RRForm_1<0b10011101110, (outs R32FP:$rT), (ins R64FP:$rA), + "frds\t$rT, $rA", SPrecFP, + [(set R32FP:$rT, (fround R64FP:$rA))]>; + +//ToDo include anyextend? + +//===----------------------------------------------------------------------==// +// Double precision floating point instructions +//===----------------------------------------------------------------------==// +def FAf64 : + RRForm<0b00110011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB), + "dfa\t$rT, $rA, $rB", DPrecFP, + [(set R64FP:$rT, (fadd R64FP:$rA, R64FP:$rB))]>; + +def FAv2f64 : + RRForm<0b00110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfa\t$rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), (fadd (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>; + +def FSf64 : + RRForm<0b10100011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB), + "dfs\t$rT, $rA, $rB", DPrecFP, + [(set R64FP:$rT, (fsub R64FP:$rA, R64FP:$rB))]>; + +def FSv2f64 : + RRForm<0b10100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfs\t$rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), + (fsub (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>; + +def FMf64 : + RRForm<0b01100011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB), + "dfm\t$rT, $rA, $rB", DPrecFP, + [(set R64FP:$rT, (fmul R64FP:$rA, R64FP:$rB))]>; + +def FMv2f64: + RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + "dfm\t$rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), + (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>; + +def FMAf64: + RRForm<0b00111010110, (outs R64FP:$rT), + (ins R64FP:$rA, R64FP:$rB, R64FP:$rC), + "dfma\t$rT, $rA, $rB", DPrecFP, + [(set R64FP:$rT, (fadd R64FP:$rC, (fmul R64FP:$rA, R64FP:$rB)))]>, + RegConstraint<"$rC = $rT">, + NoEncode<"$rC">; + +def FMAv2f64: + RRForm<0b00111010110, (outs VECREG:$rT), + (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "dfma\t$rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), + (fadd (v2f64 VECREG:$rC), + (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB))))]>, + RegConstraint<"$rC = $rT">, + NoEncode<"$rC">; + +def FMSf64 : + RRForm<0b10111010110, (outs R64FP:$rT), + (ins R64FP:$rA, R64FP:$rB, R64FP:$rC), + "dfms\t$rT, $rA, $rB", DPrecFP, + [(set R64FP:$rT, (fsub (fmul R64FP:$rA, R64FP:$rB), R64FP:$rC))]>, + RegConstraint<"$rC = $rT">, + NoEncode<"$rC">; + +def FMSv2f64 : + RRForm<0b10111010110, (outs VECREG:$rT), + (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "dfms\t$rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), + (fsub (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)), + (v2f64 VECREG:$rC)))]>; + +// DFNMS: - (a * b - c) +// - (a * b) + c => c - (a * b) + +class DFNMSInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b01111010110, OOL, IOL, "dfnms\t$rT, $rA, $rB", + DPrecFP, pattern>, + RegConstraint<"$rC = $rT">, + NoEncode<"$rC">; + +class DFNMSVecInst<list<dag> pattern>: + DFNMSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + pattern>; + +class DFNMSRegInst<list<dag> pattern>: + DFNMSInst<(outs R64FP:$rT), (ins R64FP:$rA, 
R64FP:$rB, R64FP:$rC), + pattern>; + +multiclass DFMultiplySubtract +{ + def v2f64 : DFNMSVecInst<[(set (v2f64 VECREG:$rT), + (fsub (v2f64 VECREG:$rC), + (fmul (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB))))]>; + + def f64 : DFNMSRegInst<[(set R64FP:$rT, + (fsub R64FP:$rC, + (fmul R64FP:$rA, R64FP:$rB)))]>; +} + +defm DFNMS : DFMultiplySubtract; + +// - (a * b + c) +// - (a * b) - c +def FNMAf64 : + RRForm<0b11111010110, (outs R64FP:$rT), + (ins R64FP:$rA, R64FP:$rB, R64FP:$rC), + "dfnma\t$rT, $rA, $rB", DPrecFP, + [(set R64FP:$rT, (fneg (fadd R64FP:$rC, (fmul R64FP:$rA, R64FP:$rB))))]>, + RegConstraint<"$rC = $rT">, + NoEncode<"$rC">; + +def FNMAv2f64 : + RRForm<0b11111010110, (outs VECREG:$rT), + (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "dfnma\t$rT, $rA, $rB", DPrecFP, + [(set (v2f64 VECREG:$rT), + (fneg (fadd (v2f64 VECREG:$rC), + (fmul (v2f64 VECREG:$rA), + (v2f64 VECREG:$rB)))))]>, + RegConstraint<"$rC = $rT">, + NoEncode<"$rC">; + +//===----------------------------------------------------------------------==// +// Floating point negation and absolute value +//===----------------------------------------------------------------------==// + +def : Pat<(fneg (v4f32 VECREG:$rA)), + (XORfnegvec (v4f32 VECREG:$rA), + (v4f32 (ILHUv4i32 0x8000)))>; + +def : Pat<(fneg R32FP:$rA), + (XORfneg32 R32FP:$rA, (ILHUr32 0x8000))>; + +// Floating point absolute value +// Note: f64 fabs is custom-selected. + +def : Pat<(fabs R32FP:$rA), + (ANDfabs32 R32FP:$rA, (IOHLr32 (ILHUr32 0x7fff), 0xffff))>; + +def : Pat<(fabs (v4f32 VECREG:$rA)), + (ANDfabsvec (v4f32 VECREG:$rA), + (IOHLv4i32 (ILHUv4i32 0x7fff), 0xffff))>; + +//===----------------------------------------------------------------------===// +// Hint for branch instructions: +//===----------------------------------------------------------------------===// +def HBRA : + HBI16Form<0b0001001,(ins hbrtarget:$brinst, brtarget:$btarg), "hbra\t$brinst, $btarg">; + +//===----------------------------------------------------------------------===// +// Execution, Load NOP (execute NOPs belong in even pipeline, load NOPs belong +// in the odd pipeline) +//===----------------------------------------------------------------------===// + +def ENOP : SPUInstr<(outs), (ins), "nop", ExecNOP> { + let Pattern = []; + + let Inst{0-10} = 0b10000000010; + let Inst{11-17} = 0; + let Inst{18-24} = 0; + let Inst{25-31} = 0; +} + +def LNOP : SPUInstr<(outs), (ins), "lnop", LoadNOP> { + let Pattern = []; + + let Inst{0-10} = 0b10000000000; + let Inst{11-17} = 0; + let Inst{18-24} = 0; + let Inst{25-31} = 0; +} + +//===----------------------------------------------------------------------===// +// Bit conversions (type conversions between vector/packed types) +// NOTE: Promotions are handled using the XS* instructions. 
+//===----------------------------------------------------------------------===// +def : Pat<(v16i8 (bitconvert (v8i16 VECREG:$src))), (v16i8 VECREG:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VECREG:$src))), (v16i8 VECREG:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VECREG:$src))), (v16i8 VECREG:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VECREG:$src))), (v16i8 VECREG:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 VECREG:$src))), (v16i8 VECREG:$src)>; + +def : Pat<(v8i16 (bitconvert (v16i8 VECREG:$src))), (v8i16 VECREG:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VECREG:$src))), (v8i16 VECREG:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VECREG:$src))), (v8i16 VECREG:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VECREG:$src))), (v8i16 VECREG:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 VECREG:$src))), (v8i16 VECREG:$src)>; + +def : Pat<(v4i32 (bitconvert (v16i8 VECREG:$src))), (v4i32 VECREG:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VECREG:$src))), (v4i32 VECREG:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VECREG:$src))), (v4i32 VECREG:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VECREG:$src))), (v4i32 VECREG:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 VECREG:$src))), (v4i32 VECREG:$src)>; + +def : Pat<(v2i64 (bitconvert (v16i8 VECREG:$src))), (v2i64 VECREG:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VECREG:$src))), (v2i64 VECREG:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 VECREG:$src))), (v2i64 VECREG:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 VECREG:$src))), (v2i64 VECREG:$src)>; +def : Pat<(v2i64 (bitconvert (v2f64 VECREG:$src))), (v2i64 VECREG:$src)>; + +def : Pat<(v4f32 (bitconvert (v16i8 VECREG:$src))), (v4f32 VECREG:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VECREG:$src))), (v4f32 VECREG:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 VECREG:$src))), (v4f32 VECREG:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VECREG:$src))), (v4f32 VECREG:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 VECREG:$src))), (v4f32 VECREG:$src)>; + +def : Pat<(v2f64 (bitconvert (v16i8 VECREG:$src))), (v2f64 VECREG:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 VECREG:$src))), (v2f64 VECREG:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 VECREG:$src))), (v2f64 VECREG:$src)>; +def : Pat<(v2f64 (bitconvert (v2i64 VECREG:$src))), (v2f64 VECREG:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 VECREG:$src))), (v2f64 VECREG:$src)>; + +def : Pat<(i128 (bitconvert (v16i8 VECREG:$src))), + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; +def : Pat<(i128 (bitconvert (v8i16 VECREG:$src))), + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; +def : Pat<(i128 (bitconvert (v4i32 VECREG:$src))), + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; +def : Pat<(i128 (bitconvert (v2i64 VECREG:$src))), + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; +def : Pat<(i128 (bitconvert (v4f32 VECREG:$src))), + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; +def : Pat<(i128 (bitconvert (v2f64 VECREG:$src))), + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; + +def : Pat<(v16i8 (bitconvert (i128 GPRC:$src))), + (v16i8 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; +def : Pat<(v8i16 (bitconvert (i128 GPRC:$src))), + (v8i16 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; +def : Pat<(v4i32 (bitconvert (i128 GPRC:$src))), + (v4i32 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; +def : Pat<(v2i64 (bitconvert (i128 GPRC:$src))), + (v2i64 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; +def : Pat<(v4f32 (bitconvert (i128 GPRC:$src))), + (v4f32 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; +def : Pat<(v2f64 (bitconvert (i128 GPRC:$src))), + (v2f64 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; + +def : Pat<(i32 (bitconvert R32FP:$rA)), + 
(COPY_TO_REGCLASS R32FP:$rA, R32C)>; + +def : Pat<(f32 (bitconvert R32C:$rA)), + (COPY_TO_REGCLASS R32C:$rA, R32FP)>; + +def : Pat<(i64 (bitconvert R64FP:$rA)), + (COPY_TO_REGCLASS R64FP:$rA, R64C)>; + +def : Pat<(f64 (bitconvert R64C:$rA)), + (COPY_TO_REGCLASS R64C:$rA, R64FP)>; + + +//===----------------------------------------------------------------------===// +// Instruction patterns: +//===----------------------------------------------------------------------===// + +// General 32-bit constants: +def : Pat<(i32 imm:$imm), + (IOHLr32 (ILHUr32 (HI16 imm:$imm)), (LO16 imm:$imm))>; + +// Single precision float constants: +def : Pat<(f32 fpimm:$imm), + (IOHLf32 (ILHUf32 (HI16_f32 fpimm:$imm)), (LO16_f32 fpimm:$imm))>; + +// General constant 32-bit vectors +def : Pat<(v4i32 v4i32Imm:$imm), + (IOHLv4i32 (v4i32 (ILHUv4i32 (HI16_vec v4i32Imm:$imm))), + (LO16_vec v4i32Imm:$imm))>; + +// 8-bit constants +def : Pat<(i8 imm:$imm), + (ILHr8 imm:$imm)>; + +//===----------------------------------------------------------------------===// +// Zero/Any/Sign extensions +//===----------------------------------------------------------------------===// + +// sext 8->32: Sign extend bytes to words +def : Pat<(sext_inreg R32C:$rSrc, i8), + (XSHWr32 (XSBHr32 R32C:$rSrc))>; + +def : Pat<(i32 (sext R8C:$rSrc)), + (XSHWr16 (XSBHr8 R8C:$rSrc))>; + +// sext 8->64: Sign extend bytes to double word +def : Pat<(sext_inreg R64C:$rSrc, i8), + (XSWDr64_inreg (XSHWr64 (XSBHr64 R64C:$rSrc)))>; + +def : Pat<(i64 (sext R8C:$rSrc)), + (XSWDr64 (XSHWr16 (XSBHr8 R8C:$rSrc)))>; + +// zext 8->16: Zero extend bytes to halfwords +def : Pat<(i16 (zext R8C:$rSrc)), + (ANDHIi8i16 R8C:$rSrc, 0xff)>; + +// zext 8->32: Zero extend bytes to words +def : Pat<(i32 (zext R8C:$rSrc)), + (ANDIi8i32 R8C:$rSrc, 0xff)>; + +// zext 8->64: Zero extend bytes to double words +def : Pat<(i64 (zext R8C:$rSrc)), + (COPY_TO_REGCLASS (SELBv4i32 (ROTQMBYv4i32 + (COPY_TO_REGCLASS + (ANDIi8i32 R8C:$rSrc,0xff), VECREG), + 0x4), + (ILv4i32 0x0), + (FSMBIv4i32 0x0f0f)), R64C)>; + +// anyext 8->16: Extend 8->16 bits, irrespective of sign, preserves high bits +def : Pat<(i16 (anyext R8C:$rSrc)), + (ORHIi8i16 R8C:$rSrc, 0)>; + +// anyext 8->32: Extend 8->32 bits, irrespective of sign, preserves high bits +def : Pat<(i32 (anyext R8C:$rSrc)), + (COPY_TO_REGCLASS R8C:$rSrc, R32C)>; + +// sext 16->64: Sign extend halfword to double word +def : Pat<(sext_inreg R64C:$rSrc, i16), + (XSWDr64_inreg (XSHWr64 R64C:$rSrc))>; + +def : Pat<(sext R16C:$rSrc), + (XSWDr64 (XSHWr16 R16C:$rSrc))>; + +// zext 16->32: Zero extend halfwords to words +def : Pat<(i32 (zext R16C:$rSrc)), + (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff))>; + +def : Pat<(i32 (zext (and R16C:$rSrc, 0xf))), + (ANDIi16i32 R16C:$rSrc, 0xf)>; + +def : Pat<(i32 (zext (and R16C:$rSrc, 0xff))), + (ANDIi16i32 R16C:$rSrc, 0xff)>; + +def : Pat<(i32 (zext (and R16C:$rSrc, 0xfff))), + (ANDIi16i32 R16C:$rSrc, 0xfff)>; + +// anyext 16->32: Extend 16->32 bits, irrespective of sign +def : Pat<(i32 (anyext R16C:$rSrc)), + (COPY_TO_REGCLASS R16C:$rSrc, R32C)>; + +//===----------------------------------------------------------------------===// +// Truncates: +// These truncates are for the SPU's supported types (i8, i16, i32). i64 and +// above are custom lowered. 
+//===----------------------------------------------------------------------===// + +def : Pat<(i8 (trunc GPRC:$src)), + (COPY_TO_REGCLASS + (SHUFBgprc GPRC:$src, GPRC:$src, + (IOHLv4i32 (ILHUv4i32 0x0f0f), 0x0f0f)), R8C)>; + +def : Pat<(i8 (trunc R64C:$src)), + (COPY_TO_REGCLASS + (SHUFBv2i64_m32 + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0707), 0x0707)), R8C)>; + +def : Pat<(i8 (trunc R32C:$src)), + (COPY_TO_REGCLASS + (SHUFBv4i32_m32 + (COPY_TO_REGCLASS R32C:$src, VECREG), + (COPY_TO_REGCLASS R32C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)), R8C)>; + +def : Pat<(i8 (trunc R16C:$src)), + (COPY_TO_REGCLASS + (SHUFBv4i32_m32 + (COPY_TO_REGCLASS R16C:$src, VECREG), + (COPY_TO_REGCLASS R16C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)), R8C)>; + +def : Pat<(i16 (trunc GPRC:$src)), + (COPY_TO_REGCLASS + (SHUFBgprc GPRC:$src, GPRC:$src, + (IOHLv4i32 (ILHUv4i32 0x0e0f), 0x0e0f)), R16C)>; + +def : Pat<(i16 (trunc R64C:$src)), + (COPY_TO_REGCLASS + (SHUFBv2i64_m32 + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0607), 0x0607)), R16C)>; + +def : Pat<(i16 (trunc R32C:$src)), + (COPY_TO_REGCLASS + (SHUFBv4i32_m32 + (COPY_TO_REGCLASS R32C:$src, VECREG), + (COPY_TO_REGCLASS R32C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0203), 0x0203)), R16C)>; + +def : Pat<(i32 (trunc GPRC:$src)), + (COPY_TO_REGCLASS + (SHUFBgprc GPRC:$src, GPRC:$src, + (IOHLv4i32 (ILHUv4i32 0x0c0d), 0x0e0f)), R32C)>; + +def : Pat<(i32 (trunc R64C:$src)), + (COPY_TO_REGCLASS + (SHUFBv2i64_m32 + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0405), 0x0607)), R32C)>; + +//===----------------------------------------------------------------------===// +// Address generation: SPU, like PPC, has to split addresses into high and +// low parts in order to load them into a register. 
+//===----------------------------------------------------------------------===// + +def : Pat<(SPUaform tglobaladdr:$in, 0), (ILAlsa tglobaladdr:$in)>; +def : Pat<(SPUaform texternalsym:$in, 0), (ILAlsa texternalsym:$in)>; +def : Pat<(SPUaform tjumptable:$in, 0), (ILAlsa tjumptable:$in)>; +def : Pat<(SPUaform tconstpool:$in, 0), (ILAlsa tconstpool:$in)>; + +def : Pat<(SPUindirect (SPUhi tglobaladdr:$in, 0), + (SPUlo tglobaladdr:$in, 0)), + (IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>; + +def : Pat<(SPUindirect (SPUhi texternalsym:$in, 0), + (SPUlo texternalsym:$in, 0)), + (IOHLlo (ILHUhi texternalsym:$in), texternalsym:$in)>; + +def : Pat<(SPUindirect (SPUhi tjumptable:$in, 0), + (SPUlo tjumptable:$in, 0)), + (IOHLlo (ILHUhi tjumptable:$in), tjumptable:$in)>; + +def : Pat<(SPUindirect (SPUhi tconstpool:$in, 0), + (SPUlo tconstpool:$in, 0)), + (IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>; + +def : Pat<(add (SPUhi tglobaladdr:$in, 0), (SPUlo tglobaladdr:$in, 0)), + (IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>; + +def : Pat<(add (SPUhi texternalsym:$in, 0), (SPUlo texternalsym:$in, 0)), + (IOHLlo (ILHUhi texternalsym:$in), texternalsym:$in)>; + +def : Pat<(add (SPUhi tjumptable:$in, 0), (SPUlo tjumptable:$in, 0)), + (IOHLlo (ILHUhi tjumptable:$in), tjumptable:$in)>; + +def : Pat<(add (SPUhi tconstpool:$in, 0), (SPUlo tconstpool:$in, 0)), + (IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>; + +// Intrinsics: +include "CellSDKIntrinsics.td" +// Various math operator instruction sequences +include "SPUMathInstr.td" +// 64-bit "instructions"/support +include "SPU64InstrInfo.td" +// 128-bit "instructions"/support +include "SPU128InstrInfo.td" diff --git a/contrib/llvm/lib/Target/CellSPU/SPUMachineFunction.cpp b/contrib/llvm/lib/Target/CellSPU/SPUMachineFunction.cpp new file mode 100644 index 000000000000..3e948d071d63 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUMachineFunction.cpp @@ -0,0 +1,14 @@ +//==-- SPUMachineFunctionInfo.cpp - Private data used for CellSPU ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "SPUMachineFunction.h" + +using namespace llvm; + +void SPUFunctionInfo::anchor() { } diff --git a/contrib/llvm/lib/Target/CellSPU/SPUMachineFunction.h b/contrib/llvm/lib/Target/CellSPU/SPUMachineFunction.h new file mode 100644 index 000000000000..399684bb0887 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUMachineFunction.h @@ -0,0 +1,50 @@ +//===-- SPUMachineFunctionInfo.h - Private data used for CellSPU --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the IBM Cell SPU specific subclass of MachineFunctionInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef SPU_MACHINE_FUNCTION_INFO_H +#define SPU_MACHINE_FUNCTION_INFO_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// SPUFunctionInfo - Cell SPU target-specific information for each +/// MachineFunction +class SPUFunctionInfo : public MachineFunctionInfo { + virtual void anchor(); + + /// UsesLR - Indicates whether LR is used in the current function. 
+ /// + bool UsesLR; + + // VarArgsFrameIndex - FrameIndex for start of varargs area. + int VarArgsFrameIndex; + +public: + SPUFunctionInfo(MachineFunction& MF) + : UsesLR(false), + VarArgsFrameIndex(0) + {} + + void setUsesLR(bool U) { UsesLR = U; } + bool usesLR() { return UsesLR; } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } +}; + +} // end of namespace llvm + + +#endif + diff --git a/contrib/llvm/lib/Target/CellSPU/SPUMathInstr.td b/contrib/llvm/lib/Target/CellSPU/SPUMathInstr.td new file mode 100644 index 000000000000..9a5c3976afbe --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUMathInstr.td @@ -0,0 +1,97 @@ +//===-- SPUMathInst.td - Cell SPU math operations ---------*- tablegen -*--===// +// +// Cell SPU math operations +// +// This target description file contains instruction sequences for various +// math operations, such as vector multiplies, i32 multiply, etc., for the +// SPU's i32, i16 i8 and corresponding vector types. +// +// Any resemblance to libsimdmath or the Cell SDK simdmath library is +// purely and completely coincidental. +//===----------------------------------------------------------------------===// + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v16i8 multiply instruction sequence: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def : Pat<(mul (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)), + (ORv4i32 + (ANDv4i32 + (SELBv4i32 (MPYv8i16 VECREG:$rA, VECREG:$rB), + (SHLHIv8i16 (MPYv8i16 (ROTMAHIv8i16 VECREG:$rA, 8), + (ROTMAHIv8i16 VECREG:$rB, 8)), 8), + (FSMBIv8i16 0x2222)), + (ILAv4i32 0x0000ffff)), + (SHLIv4i32 + (SELBv4i32 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 16), + (ROTMAIv4i32_i32 VECREG:$rB, 16)), + (SHLHIv8i16 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 8), + (ROTMAIv4i32_i32 VECREG:$rB, 8)), 8), + (FSMBIv8i16 0x2222)), 16))>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v8i16 multiply instruction sequence: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def : Pat<(mul (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)), + (SELBv8i16 (MPYv8i16 VECREG:$rA, VECREG:$rB), + (SHLIv4i32 (MPYHHv8i16 VECREG:$rA, VECREG:$rB), 16), + (FSMBIv8i16 0xcccc))>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v4i32, i32 multiply instruction sequence: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def MPYv4i32: + Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), + (Av4i32 + (v4i32 (Av4i32 (v4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB)), + (v4i32 (MPYHv4i32 VECREG:$rB, VECREG:$rA)))), + (v4i32 (MPYUv4i32 VECREG:$rA, VECREG:$rB)))>; + +def MPYi32: + Pat<(mul R32C:$rA, R32C:$rB), + (Ar32 + (Ar32 (MPYHr32 R32C:$rA, R32C:$rB), + (MPYHr32 R32C:$rB, R32C:$rA)), + (MPYUr32 R32C:$rA, R32C:$rB))>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// f32, v4f32 divide instruction sequence: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +// Reciprocal estimate and interpolation +def Interpf32: CodeFrag<(FIf32 R32FP:$rB, (FRESTf32 R32FP:$rB))>; +// Division estimate +def DivEstf32: CodeFrag<(FMf32 R32FP:$rA, Interpf32.Fragment)>; +// Newton-Raphson iteration +def NRaphf32: CodeFrag<(FMAf32 (FNMSf32 DivEstf32.Fragment, R32FP:$rB, R32FP:$rA), + Interpf32.Fragment, + DivEstf32.Fragment)>; +// Epsilon addition +def Epsilonf32: CodeFrag<(AIf32 NRaphf32.Fragment, 1)>; + +def : 
Pat<(fdiv R32FP:$rA, R32FP:$rB), + (SELBf32_cond NRaphf32.Fragment, + Epsilonf32.Fragment, + (CGTIf32 (FNMSf32 R32FP:$rB, Epsilonf32.Fragment, R32FP:$rA), -1))>; + +// Reciprocal estimate and interpolation +def Interpv4f32: CodeFrag<(FIv4f32 (v4f32 VECREG:$rB), (FRESTv4f32 (v4f32 VECREG:$rB)))>; +// Division estimate +def DivEstv4f32: CodeFrag<(FMv4f32 (v4f32 VECREG:$rA), Interpv4f32.Fragment)>; +// Newton-Raphson iteration +def NRaphv4f32: CodeFrag<(FMAv4f32 (FNMSv4f32 DivEstv4f32.Fragment, + (v4f32 VECREG:$rB), + (v4f32 VECREG:$rA)), + Interpv4f32.Fragment, + DivEstv4f32.Fragment)>; +// Epsilon addition +def Epsilonv4f32: CodeFrag<(AIv4f32 NRaphv4f32.Fragment, 1)>; + +def : Pat<(fdiv (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)), + (SELBv4f32_cond NRaphv4f32.Fragment, + Epsilonv4f32.Fragment, + (CGTIv4f32 (FNMSv4f32 (v4f32 VECREG:$rB), + Epsilonv4f32.Fragment, + (v4f32 VECREG:$rA)), -1))>; diff --git a/contrib/llvm/lib/Target/CellSPU/SPUNodes.td b/contrib/llvm/lib/Target/CellSPU/SPUNodes.td new file mode 100644 index 000000000000..a47e9ef0167c --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUNodes.td @@ -0,0 +1,159 @@ +//=== SPUNodes.td - Specialized SelectionDAG nodes by CellSPU -*- tablegen -*-// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Type profiles and SelectionDAG nodes used by CellSPU +// +//===----------------------------------------------------------------------===// + +// Type profile for a call sequence +def SDT_SPUCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>; + +// SPU_GenControl: Type profile for generating control words for insertions +def SPU_GenControl : SDTypeProfile<1, 1, []>; +def SPUshufmask : SDNode<"SPUISD::SHUFFLE_MASK", SPU_GenControl, []>; + +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPUCallSeq, + [SDNPHasChain, SDNPOutGlue]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPUCallSeq, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; +//===----------------------------------------------------------------------===// +// Operand constraints: +//===----------------------------------------------------------------------===// + +def SDT_SPUCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SPUcall : SDNode<"SPUISD::CALL", SDT_SPUCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + +// Operand type constraints for vector shuffle/permute operations +def SDT_SPUshuffle : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> +]>; + +// Vector binary operator type constraints (needs a further constraint to +// ensure that operand 0 is a vector...): + +def SPUVecBinop: SDTypeProfile<1, 2, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> +]>; + +// Trinary operators, e.g., addx, carry generate +def SPUIntTrinaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0> +]>; + +// SELECT_MASK type constraints: There are several variations for the various +// vector types (this avoids having to bit_convert all over the place.) 
+def SPUselmask_type: SDTypeProfile<1, 1, [ + SDTCisInt<1> +]>; + +// SELB type constraints: +def SPUselb_type: SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<0, 3> ]>; + +// SPU Vector shift pseudo-instruction type constraints +def SPUvecshift_type: SDTypeProfile<1, 2, [ + SDTCisSameAs<0, 1>, SDTCisInt<2>]>; + +// "marker" type for i64 operators that need a shuffle mask +// (i.e., uses cg or bg or another instruction that needs to +// use shufb to get things in the right place.) +// Op0: The result +// Op1, 2: LHS, RHS +// Op3: Carry-generate shuffle mask + +def SPUmarker_type : SDTypeProfile<1, 3, [ + SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ]>; + +//===----------------------------------------------------------------------===// +// Synthetic/pseudo-instructions +//===----------------------------------------------------------------------===// + +// SPU CNTB: +def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>; + +// SPU vector shuffle node, matched by the SPUISD::SHUFB enum (see +// SPUISelLowering.h): +def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>; + +// Vector shifts (ISD::SHL,SRL,SRA are for _integers_ only): +def SPUvec_shl: SDNode<"ISD::SHL", SPUvecshift_type, []>; +def SPUvec_srl: SDNode<"ISD::SRL", SPUvecshift_type, []>; +def SPUvec_sra: SDNode<"ISD::SRA", SPUvecshift_type, []>; + +def SPUvec_rotl: SDNode<"SPUISD::VEC_ROTL", SPUvecshift_type, []>; +def SPUvec_rotr: SDNode<"SPUISD::VEC_ROTR", SPUvecshift_type, []>; + +// Vector rotate left, bits shifted out of the left are rotated in on the right +def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT", + SPUvecshift_type, []>; + +// Vector rotate left by bytes, but the count is given in bits and the SPU +// internally converts it to bytes (saves an instruction to mask off lower +// three bits) +def SPUrotbytes_left_bits : SDNode<"SPUISD::ROTBYTES_LEFT_BITS", + SPUvecshift_type>; + +// Shift entire quad left by bytes/bits. 
Zeros are shifted in on the right +// SHL_BITS the same as SHL for i128, but ISD::SHL is not implemented for i128 +def SPUshlquad_l_bytes: SDNode<"SPUISD::SHL_BYTES", SPUvecshift_type, []>; +def SPUshlquad_l_bits: SDNode<"SPUISD::SHL_BITS", SPUvecshift_type, []>; +def SPUsrl_bytes: SDNode<"SPUISD::SRL_BYTES", SPUvecshift_type, []>; + +// SPU form select mask for bytes, immediate +def SPUselmask: SDNode<"SPUISD::SELECT_MASK", SPUselmask_type, []>; + +// SPU select bits instruction +def SPUselb: SDNode<"SPUISD::SELB", SPUselb_type, []>; + +def SDTprefslot2vec: SDTypeProfile<1, 1, []>; +def SPUprefslot2vec: SDNode<"SPUISD::PREFSLOT2VEC", SDTprefslot2vec, []>; + +def SPU_vec_demote : SDTypeProfile<1, 1, []>; +def SPUvec2prefslot: SDNode<"SPUISD::VEC2PREFSLOT", SPU_vec_demote, []>; + +// Address high and low components, used for [r+r] type addressing +def SPUhi : SDNode<"SPUISD::Hi", SDTIntBinOp, []>; +def SPUlo : SDNode<"SPUISD::Lo", SDTIntBinOp, []>; + +// PC-relative address +def SPUpcrel : SDNode<"SPUISD::PCRelAddr", SDTIntBinOp, []>; + +// A-Form local store addresses +def SPUaform : SDNode<"SPUISD::AFormAddr", SDTIntBinOp, []>; + +// Indirect [D-Form "imm($reg)" and X-Form "$reg($reg)"] addresses +def SPUindirect : SDNode<"SPUISD::IndirectAddr", SDTIntBinOp, []>; + +// i64 markers: supplies extra operands used to generate the i64 operator +// instruction sequences +def SPUadd64 : SDNode<"SPUISD::ADD64_MARKER", SPUmarker_type, []>; +def SPUsub64 : SDNode<"SPUISD::SUB64_MARKER", SPUmarker_type, []>; +def SPUmul64 : SDNode<"SPUISD::MUL64_MARKER", SPUmarker_type, []>; + +//===----------------------------------------------------------------------===// +// Constraints: (taken from PPCInstrInfo.td) +//===----------------------------------------------------------------------===// + +class RegConstraint<string C> { + string Constraints = C; +} + +class NoEncode<string E> { + string DisableEncoding = E; +} + +//===----------------------------------------------------------------------===// +// Return (flag isn't quite what it means: the operations are flagged so that +// instruction scheduling doesn't disassociate them.) +//===----------------------------------------------------------------------===// + +def retflag : SDNode<"SPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; diff --git a/contrib/llvm/lib/Target/CellSPU/SPUNopFiller.cpp b/contrib/llvm/lib/Target/CellSPU/SPUNopFiller.cpp new file mode 100644 index 000000000000..7c58041e3b84 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUNopFiller.cpp @@ -0,0 +1,153 @@ +//===-- SPUNopFiller.cpp - Add nops/lnops to align the pipelines ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The final pass just before assembly printing. This pass is the last +// checkpoint where nops and lnops are added to the instruction stream to +// satisfy the dual issue requirements. 
The actual dual issue scheduling is
+// done (TODO: nowhere, currently)
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPUTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+  struct SPUNopFiller : public MachineFunctionPass {
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+    const InstrItineraryData *IID;
+    bool isEvenPlace;  // the instruction slot (mem address) at hand is even/odd
+
+    static char ID;
+    SPUNopFiller(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()),
+        IID(tm.getInstrItineraryData())
+    {
+      DEBUG( dbgs() << "********** SPU Nop filler **********\n" ; );
+    }
+
+    virtual const char *getPassName() const {
+      return "SPU nop/lnop Filler";
+    }
+
+    void runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+    bool runOnMachineFunction(MachineFunction &F) {
+      isEvenPlace = true; // all functions get an .align 3 directive at start
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        runOnMachineBasicBlock(*FI);
+      return true; // never-ever do any more modifications, just print it!
+    }
+
+    typedef enum { none   = 0, // no more instructions in this function / BB
+                   pseudo = 1, // this does not get executed
+                   even   = 2,
+                   odd    = 3 } SPUOpPlace;
+    SPUOpPlace getOpPlacement( MachineInstr &instr );
+
+  };
+  char SPUNopFiller::ID = 0;
+
+}
+
+// Fill a BasicBlock to alignment.
+// In the assembly we align the functions to 'even' addresses, but
+// basic blocks have an implicit alignment. We hereby define
+// basic blocks to have the same, even, alignment.
+void SPUNopFiller::
+runOnMachineBasicBlock(MachineBasicBlock &MBB)
+{
+  assert( isEvenPlace && "basic block starts at an odd address");
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+  {
+    SPUOpPlace this_optype, next_optype;
+    MachineBasicBlock::iterator J = I;
+    J++;
+
+    this_optype = getOpPlacement( *I );
+    next_optype = none;
+    while (J!=MBB.end()){
+      next_optype = getOpPlacement( *J );
+      ++J;
+      if (next_optype != pseudo )
+        break;
+    }
+
+    // pad: odd(wrong), even(wrong), ...
+    // to:  nop(corr), odd(corr), even(corr)...
+    if( isEvenPlace && this_optype == odd && next_optype == even ) {
+      DEBUG( dbgs() <<"Adding NOP before: "; );
+      DEBUG( I->dump(); );
+      BuildMI(MBB, I, I->getDebugLoc(), TII->get(SPU::ENOP));
+      isEvenPlace=false;
+    }
+
+    // pad: even(wrong), odd(wrong), ...
+    // to:  lnop(corr), even(corr), odd(corr)...
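+    // Example: at an odd slot, an even-pipeline instruction followed by an
+    // odd-pipeline one would land in the wrong slots; the LNOP issued below
+    // occupies the odd slot (it is the odd-pipeline no-op), pushing the
+    // even-pipeline instruction back onto an even address.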
+ else if ( !isEvenPlace && this_optype == even && next_optype == odd){ + DEBUG( dbgs() <<"Adding LNOP before: "; ); + DEBUG( I->dump(); ); + BuildMI(MBB, I, I->getDebugLoc(), TII->get(SPU::LNOP)); + isEvenPlace=true; + } + + // now go to next mem slot + if( this_optype != pseudo ) + isEvenPlace = !isEvenPlace; + + } + + // padd basicblock end + if( !isEvenPlace ){ + MachineBasicBlock::iterator J = MBB.end(); + J--; + if (getOpPlacement( *J ) == odd) { + DEBUG( dbgs() <<"Padding basic block with NOP\n"; ); + BuildMI(MBB, J, J->getDebugLoc(), TII->get(SPU::ENOP)); + } + else { + J++; + DEBUG( dbgs() <<"Padding basic block with LNOP\n"; ); + BuildMI(MBB, J, DebugLoc(), TII->get(SPU::LNOP)); + } + isEvenPlace=true; + } +} + +FunctionPass *llvm::createSPUNopFillerPass(SPUTargetMachine &tm) { + return new SPUNopFiller(tm); +} + +// Figure out if 'instr' is executed in the even or odd pipeline +SPUNopFiller::SPUOpPlace +SPUNopFiller::getOpPlacement( MachineInstr &instr ) { + int sc = instr.getDesc().getSchedClass(); + const InstrStage *stage = IID->beginStage(sc); + unsigned FUs = stage->getUnits(); + SPUOpPlace retval; + + switch( FUs ) { + case 0: retval = pseudo; break; + case 1: retval = odd; break; + case 2: retval = even; break; + default: retval= pseudo; + assert( false && "got unknown FuncUnit\n"); + break; + }; + return retval; +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPUOperands.td b/contrib/llvm/lib/Target/CellSPU/SPUOperands.td new file mode 100644 index 000000000000..6f8deef5530f --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUOperands.td @@ -0,0 +1,664 @@ +//===-- SPUOperands.td - Cell SPU Instruction Operands -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Cell SPU Instruction Operands: +//===----------------------------------------------------------------------===// + +// TO_IMM32 - Convert an i8/i16 to i32. +def TO_IMM32 : SDNodeXForm<imm, [{ + return getI32Imm(N->getZExtValue()); +}]>; + +// TO_IMM16 - Convert an i8/i32 to i16. +def TO_IMM16 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i16); +}]>; + + +def LO16 : SDNodeXForm<imm, [{ + unsigned val = N->getZExtValue(); + // Transformation function: get the low 16 bits. + return getI32Imm(val & 0xffff); +}]>; + +def LO16_vec : SDNodeXForm<scalar_to_vector, [{ + SDValue OpVal(0, 0); + + // Transformation function: get the low 16 bit immediate from a build_vector + // node. + assert(N->getOpcode() == ISD::BUILD_VECTOR + && "LO16_vec got something other than a BUILD_VECTOR"); + + // Get first constant operand... + for (unsigned i = 0, e = N->getNumOperands(); + OpVal.getNode() == 0 && i != e; ++i) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (OpVal.getNode() == 0) + OpVal = N->getOperand(i); + } + + assert(OpVal.getNode() != 0 && "LO16_vec did not locate a <defined> node"); + ConstantSDNode *CN = cast<ConstantSDNode>(OpVal); + return getI32Imm((unsigned)CN->getZExtValue() & 0xffff); +}]>; + +// Transform an immediate, returning the high 16 bits shifted down: +def HI16 : SDNodeXForm<imm, [{ + return getI32Imm((unsigned)N->getZExtValue() >> 16); +}]>; + +// Transformation function: shift the high 16 bit immediate from a build_vector +// node into the low 16 bits, and return a 16-bit constant. 
+def HI16_vec : SDNodeXForm<scalar_to_vector, [{ + SDValue OpVal(0, 0); + + assert(N->getOpcode() == ISD::BUILD_VECTOR + && "HI16_vec got something other than a BUILD_VECTOR"); + + // Get first constant operand... + for (unsigned i = 0, e = N->getNumOperands(); + OpVal.getNode() == 0 && i != e; ++i) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (OpVal.getNode() == 0) + OpVal = N->getOperand(i); + } + + assert(OpVal.getNode() != 0 && "HI16_vec did not locate a <defined> node"); + ConstantSDNode *CN = cast<ConstantSDNode>(OpVal); + return getI32Imm((unsigned)CN->getZExtValue() >> 16); +}]>; + +// simm7 predicate - True if the immediate fits in an 7-bit signed +// field. +def simm7: PatLeaf<(imm), [{ + int sextVal = int(N->getSExtValue()); + return (sextVal >= -64 && sextVal <= 63); +}]>; + +// uimm7 predicate - True if the immediate fits in an 7-bit unsigned +// field. +def uimm7: PatLeaf<(imm), [{ + return (N->getZExtValue() <= 0x7f); +}]>; + +// immSExt8 predicate - True if the immediate fits in an 8-bit sign extended +// field. +def immSExt8 : PatLeaf<(imm), [{ + int Value = int(N->getSExtValue()); + return (Value >= -(1 << 8) && Value <= (1 << 8) - 1); +}]>; + +// immU8: immediate, unsigned 8-bit quantity +def immU8 : PatLeaf<(imm), [{ + return (N->getZExtValue() <= 0xff); +}]>; + +// i32ImmSExt10 predicate - True if the i32 immediate fits in a 10-bit sign +// extended field. Used by RI10Form instructions like 'ldq'. +def i32ImmSExt10 : PatLeaf<(imm), [{ + return isI32IntS10Immediate(N); +}]>; + +// i32ImmUns10 predicate - True if the i32 immediate fits in a 10-bit unsigned +// field. Used by RI10Form instructions like 'ldq'. +def i32ImmUns10 : PatLeaf<(imm), [{ + return isI32IntU10Immediate(N); +}]>; + +// i16ImmSExt10 predicate - True if the i16 immediate fits in a 10-bit sign +// extended field. Used by RI10Form instructions like 'ldq'. +def i16ImmSExt10 : PatLeaf<(imm), [{ + return isI16IntS10Immediate(N); +}]>; + +// i16ImmUns10 predicate - True if the i16 immediate fits into a 10-bit unsigned +// value. Used by RI10Form instructions. +def i16ImmUns10 : PatLeaf<(imm), [{ + return isI16IntU10Immediate(N); +}]>; + +def immSExt16 : PatLeaf<(imm), [{ + // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended + // field. + short Ignored; + return isIntS16Immediate(N, Ignored); +}]>; + +def immZExt16 : PatLeaf<(imm), [{ + // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended + // field. + return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue(); +}], LO16>; + +def immU16 : PatLeaf<(imm), [{ + // immU16 predicate- True if the immediate fits into a 16-bit unsigned field. + return (uint64_t)N->getZExtValue() == (N->getZExtValue() & 0xffff); +}]>; + +def imm18 : PatLeaf<(imm), [{ + // imm18 predicate: True if the immediate fits into an 18-bit unsigned field. 
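+  // (18 bits is the widest immediate the SPU loads in a single instruction,
+  // e.g. with ILA.)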
+ int Value = (int) N->getZExtValue(); + return isUInt<18>(Value); +}]>; + +def lo16 : PatLeaf<(imm), [{ + // lo16 predicate - returns true if the immediate has all zeros in the + // low order bits and is a 32-bit constant: + if (N->getValueType(0) == MVT::i32) { + uint32_t val = N->getZExtValue(); + return ((val & 0x0000ffff) == val); + } + + return false; +}], LO16>; + +def hi16 : PatLeaf<(imm), [{ + // hi16 predicate - returns true if the immediate has all zeros in the + // low order bits and is a 32-bit constant: + if (N->getValueType(0) == MVT::i32) { + uint32_t val = uint32_t(N->getZExtValue()); + return ((val & 0xffff0000) == val); + } else if (N->getValueType(0) == MVT::i64) { + uint64_t val = N->getZExtValue(); + return ((val & 0xffff0000ULL) == val); + } + + return false; +}], HI16>; + +def bitshift : PatLeaf<(imm), [{ + // bitshift predicate - returns true if 0 < imm <= 7 for SHLQBII + // (shift left quadword by bits immediate) + int64_t Val = N->getZExtValue(); + return (Val > 0 && Val <= 7); +}]>; + +//===----------------------------------------------------------------------===// +// Floating point operands: +//===----------------------------------------------------------------------===// + +// Transform a float, returning the high 16 bits shifted down, as if +// the float was really an unsigned integer: +def HI16_f32 : SDNodeXForm<fpimm, [{ + float fval = N->getValueAPF().convertToFloat(); + return getI32Imm(FloatToBits(fval) >> 16); +}]>; + +// Transformation function on floats: get the low 16 bits as if the float was +// an unsigned integer. +def LO16_f32 : SDNodeXForm<fpimm, [{ + float fval = N->getValueAPF().convertToFloat(); + return getI32Imm(FloatToBits(fval) & 0xffff); +}]>; + +def FPimm_sext16 : SDNodeXForm<fpimm, [{ + float fval = N->getValueAPF().convertToFloat(); + return getI32Imm((int) ((FloatToBits(fval) << 16) >> 16)); +}]>; + +def FPimm_u18 : SDNodeXForm<fpimm, [{ + float fval = N->getValueAPF().convertToFloat(); + return getI32Imm(FloatToBits(fval) & ((1 << 18) - 1)); +}]>; + +def fpimmSExt16 : PatLeaf<(fpimm), [{ + short Ignored; + return isFPS16Immediate(N, Ignored); +}], FPimm_sext16>; + +// Does the SFP constant only have upp 16 bits set? +def hi16_f32 : PatLeaf<(fpimm), [{ + if (N->getValueType(0) == MVT::f32) { + uint32_t val = FloatToBits(N->getValueAPF().convertToFloat()); + return ((val & 0xffff0000) == val); + } + + return false; +}], HI16_f32>; + +// Does the SFP constant fit into 18 bits? +def fpimm18 : PatLeaf<(fpimm), [{ + if (N->getValueType(0) == MVT::f32) { + uint32_t Value = FloatToBits(N->getValueAPF().convertToFloat()); + return isUInt<18>(Value); + } + + return false; +}], FPimm_u18>; + +//===----------------------------------------------------------------------===// +// 64-bit operands (TODO): +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// build_vector operands: +//===----------------------------------------------------------------------===// + +// v16i8SExt8Imm_xform function: convert build_vector to 8-bit sign extended +// immediate constant load for v16i8 vectors. N.B.: The incoming constant has +// to be a 16-bit quantity with the upper and lower bytes equal (e.g., 0x2a2a). 
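+// (The byte splat is emitted as a halfword immediate load, which is why the
+// value must appear duplicated in both bytes of the halfword.)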
+def v16i8SExt8Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8); +}]>; + +// v16i8SExt8Imm: Predicate test for 8-bit sign extended immediate constant +// load, works in conjunction with its transform function. N.B.: This relies the +// incoming constant being a 16-bit quantity, where the upper and lower bytes +// are EXACTLY the same (e.g., 0x2a2a) +def v16i8SExt8Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8).getNode() != 0; +}], v16i8SExt8Imm_xform>; + +// v16i8U8Imm_xform function: convert build_vector to unsigned 8-bit +// immediate constant load for v16i8 vectors. N.B.: The incoming constant has +// to be a 16-bit quantity with the upper and lower bytes equal (e.g., 0x2a2a). +def v16i8U8Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8); +}]>; + +// v16i8U8Imm: Predicate test for unsigned 8-bit immediate constant +// load, works in conjunction with its transform function. N.B.: This relies the +// incoming constant being a 16-bit quantity, where the upper and lower bytes +// are EXACTLY the same (e.g., 0x2a2a) +def v16i8U8Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8).getNode() != 0; +}], v16i8U8Imm_xform>; + +// v8i16SExt8Imm_xform function: convert build_vector to 8-bit sign extended +// immediate constant load for v8i16 vectors. +def v8i16SExt8Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i8imm(N, *CurDAG, MVT::i16); +}]>; + +// v8i16SExt8Imm: Predicate test for 8-bit sign extended immediate constant +// load, works in conjunction with its transform function. +def v8i16SExt8Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i8imm(N, *CurDAG, MVT::i16).getNode() != 0; +}], v8i16SExt8Imm_xform>; + +// v8i16SExt10Imm_xform function: convert build_vector to 16-bit sign extended +// immediate constant load for v8i16 vectors. +def v8i16SExt10Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16); +}]>; + +// v8i16SExt10Imm: Predicate test for 16-bit sign extended immediate constant +// load, works in conjunction with its transform function. +def v8i16SExt10Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16).getNode() != 0; +}], v8i16SExt10Imm_xform>; + +// v8i16Uns10Imm_xform function: convert build_vector to 16-bit unsigned +// immediate constant load for v8i16 vectors. +def v8i16Uns10Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16); +}]>; + +// v8i16Uns10Imm: Predicate test for 16-bit unsigned immediate constant +// load, works in conjunction with its transform function. +def v8i16Uns10Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16).getNode() != 0; +}], v8i16Uns10Imm_xform>; + +// v8i16SExt16Imm_xform function: convert build_vector to 16-bit sign extended +// immediate constant load for v8i16 vectors. +def v8i16Uns16Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i16imm(N, *CurDAG, MVT::i16); +}]>; + +// v8i16SExt16Imm: Predicate test for 16-bit sign extended immediate constant +// load, works in conjunction with its transform function. +def v8i16SExt16Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i16imm(N, *CurDAG, MVT::i16).getNode() != 0; +}], v8i16Uns16Imm_xform>; + +// v4i32SExt10Imm_xform function: convert build_vector to 10-bit sign extended +// immediate constant load for v4i32 vectors. 
+def v4i32SExt10Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32); +}]>; + +// v4i32SExt10Imm: Predicate test for 10-bit sign extended immediate constant +// load, works in conjunction with its transform function. +def v4i32SExt10Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32).getNode() != 0; +}], v4i32SExt10Imm_xform>; + +// v4i32Uns10Imm_xform function: convert build_vector to 10-bit unsigned +// immediate constant load for v4i32 vectors. +def v4i32Uns10Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32); +}]>; + +// v4i32Uns10Imm: Predicate test for 10-bit unsigned immediate constant +// load, works in conjunction with its transform function. +def v4i32Uns10Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32).getNode() != 0; +}], v4i32Uns10Imm_xform>; + +// v4i32SExt16Imm_xform function: convert build_vector to 16-bit sign extended +// immediate constant load for v4i32 vectors. +def v4i32SExt16Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i16imm(N, *CurDAG, MVT::i32); +}]>; + +// v4i32SExt16Imm: Predicate test for 16-bit sign extended immediate constant +// load, works in conjunction with its transform function. +def v4i32SExt16Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i16imm(N, *CurDAG, MVT::i32).getNode() != 0; +}], v4i32SExt16Imm_xform>; + +// v4i32Uns18Imm_xform function: convert build_vector to 18-bit unsigned +// immediate constant load for v4i32 vectors. +def v4i32Uns18Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_u18imm(N, *CurDAG, MVT::i32); +}]>; + +// v4i32Uns18Imm: Predicate test for 18-bit unsigned immediate constant load, +// works in conjunction with its transform function. +def v4i32Uns18Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_u18imm(N, *CurDAG, MVT::i32).getNode() != 0; +}], v4i32Uns18Imm_xform>; + +// ILHUvec_get_imm xform function: convert build_vector to ILHUvec imm constant +// load. +def ILHUvec_get_imm: SDNodeXForm<build_vector, [{ + return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i32); +}]>; + +/// immILHUvec: Predicate test for a ILHU constant vector. +def immILHUvec: PatLeaf<(build_vector), [{ + return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i32).getNode() != 0; +}], ILHUvec_get_imm>; + +// Catch-all for any other i32 vector constants +def v4i32_get_imm: SDNodeXForm<build_vector, [{ + return SPU::get_v4i32_imm(N, *CurDAG); +}]>; + +def v4i32Imm: PatLeaf<(build_vector), [{ + return SPU::get_v4i32_imm(N, *CurDAG).getNode() != 0; +}], v4i32_get_imm>; + +// v2i64SExt10Imm_xform function: convert build_vector to 10-bit sign extended +// immediate constant load for v2i64 vectors. +def v2i64SExt10Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i64); +}]>; + +// v2i64SExt10Imm: Predicate test for 10-bit sign extended immediate constant +// load, works in conjunction with its transform function. +def v2i64SExt10Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i10imm(N, *CurDAG, MVT::i64).getNode() != 0; +}], v2i64SExt10Imm_xform>; + +// v2i64SExt16Imm_xform function: convert build_vector to 16-bit sign extended +// immediate constant load for v2i64 vectors. +def v2i64SExt16Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_i16imm(N, *CurDAG, MVT::i64); +}]>; + +// v2i64SExt16Imm: Predicate test for 16-bit sign extended immediate constant +// load, works in conjunction with its transform function. 
+def v2i64SExt16Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_i16imm(N, *CurDAG, MVT::i64).getNode() != 0; +}], v2i64SExt16Imm_xform>; + +// v2i64Uns18Imm_xform function: convert build_vector to 18-bit unsigned +// immediate constant load for v2i64 vectors. +def v2i64Uns18Imm_xform: SDNodeXForm<build_vector, [{ + return SPU::get_vec_u18imm(N, *CurDAG, MVT::i64); +}]>; + +// v2i64Uns18Imm: Predicate test for 18-bit unsigned immediate constant load, +// works in conjunction with its transform function. +def v2i64Uns18Imm: PatLeaf<(build_vector), [{ + return SPU::get_vec_u18imm(N, *CurDAG, MVT::i64).getNode() != 0; +}], v2i64Uns18Imm_xform>; + +/// immILHUvec: Predicate test for a ILHU constant vector. +def immILHUvec_i64: PatLeaf<(build_vector), [{ + return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i64).getNode() != 0; +}], ILHUvec_get_imm>; + +// Catch-all for any other i32 vector constants +def v2i64_get_imm: SDNodeXForm<build_vector, [{ + return SPU::get_v2i64_imm(N, *CurDAG); +}]>; + +def v2i64Imm: PatLeaf<(build_vector), [{ + return SPU::get_v2i64_imm(N, *CurDAG).getNode() != 0; +}], v2i64_get_imm>; + +//===----------------------------------------------------------------------===// +// Operand Definitions. + +def s7imm: Operand<i8> { + let PrintMethod = "printS7ImmOperand"; +} + +def s7imm_i8: Operand<i8> { + let PrintMethod = "printS7ImmOperand"; +} + +def u7imm: Operand<i16> { + let PrintMethod = "printU7ImmOperand"; +} + +def u7imm_i8: Operand<i8> { + let PrintMethod = "printU7ImmOperand"; +} + +def u7imm_i32: Operand<i32> { + let PrintMethod = "printU7ImmOperand"; +} + +// Halfword, signed 10-bit constant +def s10imm : Operand<i16> { + let PrintMethod = "printS10ImmOperand"; +} + +def s10imm_i8: Operand<i8> { + let PrintMethod = "printS10ImmOperand"; +} + +def s10imm_i32: Operand<i32> { + let PrintMethod = "printS10ImmOperand"; +} + +def s10imm_i64: Operand<i64> { + let PrintMethod = "printS10ImmOperand"; +} + +// Unsigned 10-bit integers: +def u10imm: Operand<i16> { + let PrintMethod = "printU10ImmOperand"; +} + +def u10imm_i8: Operand<i8> { + let PrintMethod = "printU10ImmOperand"; +} + +def u10imm_i32: Operand<i32> { + let PrintMethod = "printU10ImmOperand"; +} + +def s16imm : Operand<i16> { + let PrintMethod = "printS16ImmOperand"; +} + +def s16imm_i8: Operand<i8> { + let PrintMethod = "printS16ImmOperand"; +} + +def s16imm_i32: Operand<i32> { + let PrintMethod = "printS16ImmOperand"; +} + +def s16imm_i64: Operand<i64> { + let PrintMethod = "printS16ImmOperand"; +} + +def s16imm_f32: Operand<f32> { + let PrintMethod = "printS16ImmOperand"; +} + +def s16imm_f64: Operand<f64> { + let PrintMethod = "printS16ImmOperand"; +} + +def u16imm_i64 : Operand<i64> { + let PrintMethod = "printU16ImmOperand"; +} + +def u16imm_i32 : Operand<i32> { + let PrintMethod = "printU16ImmOperand"; +} + +def u16imm : Operand<i16> { + let PrintMethod = "printU16ImmOperand"; +} + +def f16imm : Operand<f32> { + let PrintMethod = "printU16ImmOperand"; +} + +def s18imm : Operand<i32> { + let PrintMethod = "printS18ImmOperand"; +} + +def u18imm : Operand<i32> { + let PrintMethod = "printU18ImmOperand"; +} + +def u18imm_i64 : Operand<i64> { + let PrintMethod = "printU18ImmOperand"; +} + +def f18imm : Operand<f32> { + let PrintMethod = "printU18ImmOperand"; +} + +def f18imm_f64 : Operand<f64> { + let PrintMethod = "printU18ImmOperand"; +} + +// Negated 7-bit halfword rotate immediate operands +def rothNeg7imm : Operand<i32> { + let PrintMethod = "printROTHNeg7Imm"; +} + +def rothNeg7imm_i16 : Operand<i16> { 
+ let PrintMethod = "printROTHNeg7Imm"; +} + +// Negated 7-bit word rotate immediate operands +def rotNeg7imm : Operand<i32> { + let PrintMethod = "printROTNeg7Imm"; +} + +def rotNeg7imm_i16 : Operand<i16> { + let PrintMethod = "printROTNeg7Imm"; +} + +def rotNeg7imm_i8 : Operand<i8> { + let PrintMethod = "printROTNeg7Imm"; +} + +def target : Operand<OtherVT> { + let PrintMethod = "printBranchOperand"; +} + +// Absolute address call target +def calltarget : Operand<iPTR> { + let PrintMethod = "printCallOperand"; + let MIOperandInfo = (ops u18imm:$calldest); +} + +// PC relative call target +def relcalltarget : Operand<iPTR> { + let PrintMethod = "printPCRelativeOperand"; + let MIOperandInfo = (ops s16imm:$calldest); +} + +// Branch targets: +def brtarget : Operand<OtherVT> { + let PrintMethod = "printPCRelativeOperand"; +} + +// Hint for branch target +def hbrtarget : Operand<OtherVT> { + let PrintMethod = "printHBROperand"; +} + +// Indirect call target +def indcalltarget : Operand<iPTR> { + let PrintMethod = "printCallOperand"; + let MIOperandInfo = (ops ptr_rc:$calldest); +} + +def symbolHi: Operand<i32> { + let PrintMethod = "printSymbolHi"; +} + +def symbolLo: Operand<i32> { + let PrintMethod = "printSymbolLo"; +} + +def symbolLSA: Operand<i32> { + let PrintMethod = "printSymbolLSA"; +} + +// Shuffle address memory operaand [s7imm(reg) d-format] +def shufaddr : Operand<iPTR> { + let PrintMethod = "printShufAddr"; + let MIOperandInfo = (ops s7imm:$imm, ptr_rc:$reg); +} + +// memory s10imm(reg) operand +def dformaddr : Operand<iPTR> { + let PrintMethod = "printDFormAddr"; + let MIOperandInfo = (ops s10imm:$imm, ptr_rc:$reg); +} + +// 256K local store address +// N.B.: The tblgen code generator expects to have two operands, an offset +// and a pointer. Of these, only the immediate is actually used. +def addr256k : Operand<iPTR> { + let PrintMethod = "printAddr256K"; + let MIOperandInfo = (ops s16imm:$imm, ptr_rc:$reg); +} + +// memory s18imm(reg) operand +def memri18 : Operand<iPTR> { + let PrintMethod = "printMemRegImmS18"; + let MIOperandInfo = (ops s18imm:$imm, ptr_rc:$reg); +} + +// memory register + register operand +def memrr : Operand<iPTR> { + let PrintMethod = "printMemRegReg"; + let MIOperandInfo = (ops ptr_rc:$reg_a, ptr_rc:$reg_b); +} + +// Define SPU-specific addressing modes: These come in three basic +// flavors: +// +// D-form : [r+I10] (10-bit signed offset + reg) +// X-form : [r+r] (reg+reg) +// A-form : abs (256K LSA offset) +// D-form(2): [r+I7] (7-bit signed offset + reg) + +def dform_addr : ComplexPattern<iPTR, 2, "SelectDFormAddr", + [], [SDNPWantRoot]>; +def xform_addr : ComplexPattern<iPTR, 2, "SelectXFormAddr", + [], [SDNPWantRoot]>; +def aform_addr : ComplexPattern<iPTR, 2, "SelectAFormAddr", + [], [SDNPWantRoot]>; +def dform2_addr : ComplexPattern<iPTR, 2, "SelectDForm2Addr", + [], [SDNPWantRoot]>; diff --git a/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.cpp b/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.cpp new file mode 100644 index 000000000000..1b2da5f50c81 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -0,0 +1,356 @@ +//===-- SPURegisterInfo.cpp - Cell SPU Register Information ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the Cell implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "reginfo" +#include "SPURegisterInfo.h" +#include "SPU.h" +#include "SPUInstrBuilder.h" +#include "SPUSubtarget.h" +#include "SPUMachineFunction.h" +#include "SPUFrameLowering.h" +#include "llvm/Constants.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include <cstdlib> + +#define GET_REGINFO_TARGET_DESC +#include "SPUGenRegisterInfo.inc" + +using namespace llvm; + +/// getRegisterNumbering - Given the enum value for some register, e.g. +/// PPC::F14, return the number that it corresponds to (e.g. 14). +unsigned SPURegisterInfo::getRegisterNumbering(unsigned RegEnum) { + using namespace SPU; + switch (RegEnum) { + case SPU::R0: return 0; + case SPU::R1: return 1; + case SPU::R2: return 2; + case SPU::R3: return 3; + case SPU::R4: return 4; + case SPU::R5: return 5; + case SPU::R6: return 6; + case SPU::R7: return 7; + case SPU::R8: return 8; + case SPU::R9: return 9; + case SPU::R10: return 10; + case SPU::R11: return 11; + case SPU::R12: return 12; + case SPU::R13: return 13; + case SPU::R14: return 14; + case SPU::R15: return 15; + case SPU::R16: return 16; + case SPU::R17: return 17; + case SPU::R18: return 18; + case SPU::R19: return 19; + case SPU::R20: return 20; + case SPU::R21: return 21; + case SPU::R22: return 22; + case SPU::R23: return 23; + case SPU::R24: return 24; + case SPU::R25: return 25; + case SPU::R26: return 26; + case SPU::R27: return 27; + case SPU::R28: return 28; + case SPU::R29: return 29; + case SPU::R30: return 30; + case SPU::R31: return 31; + case SPU::R32: return 32; + case SPU::R33: return 33; + case SPU::R34: return 34; + case SPU::R35: return 35; + case SPU::R36: return 36; + case SPU::R37: return 37; + case SPU::R38: return 38; + case SPU::R39: return 39; + case SPU::R40: return 40; + case SPU::R41: return 41; + case SPU::R42: return 42; + case SPU::R43: return 43; + case SPU::R44: return 44; + case SPU::R45: return 45; + case SPU::R46: return 46; + case SPU::R47: return 47; + case SPU::R48: return 48; + case SPU::R49: return 49; + case SPU::R50: return 50; + case SPU::R51: return 51; + case SPU::R52: return 52; + case SPU::R53: return 53; + case SPU::R54: return 54; + case SPU::R55: return 55; + case SPU::R56: return 56; + case SPU::R57: return 57; + case SPU::R58: return 58; + case SPU::R59: return 59; + case SPU::R60: return 60; + case SPU::R61: return 61; + case SPU::R62: return 62; + case SPU::R63: return 63; + case SPU::R64: return 64; + case SPU::R65: return 65; + case SPU::R66: return 66; + case SPU::R67: return 67; + case SPU::R68: 
return 68; + case SPU::R69: return 69; + case SPU::R70: return 70; + case SPU::R71: return 71; + case SPU::R72: return 72; + case SPU::R73: return 73; + case SPU::R74: return 74; + case SPU::R75: return 75; + case SPU::R76: return 76; + case SPU::R77: return 77; + case SPU::R78: return 78; + case SPU::R79: return 79; + case SPU::R80: return 80; + case SPU::R81: return 81; + case SPU::R82: return 82; + case SPU::R83: return 83; + case SPU::R84: return 84; + case SPU::R85: return 85; + case SPU::R86: return 86; + case SPU::R87: return 87; + case SPU::R88: return 88; + case SPU::R89: return 89; + case SPU::R90: return 90; + case SPU::R91: return 91; + case SPU::R92: return 92; + case SPU::R93: return 93; + case SPU::R94: return 94; + case SPU::R95: return 95; + case SPU::R96: return 96; + case SPU::R97: return 97; + case SPU::R98: return 98; + case SPU::R99: return 99; + case SPU::R100: return 100; + case SPU::R101: return 101; + case SPU::R102: return 102; + case SPU::R103: return 103; + case SPU::R104: return 104; + case SPU::R105: return 105; + case SPU::R106: return 106; + case SPU::R107: return 107; + case SPU::R108: return 108; + case SPU::R109: return 109; + case SPU::R110: return 110; + case SPU::R111: return 111; + case SPU::R112: return 112; + case SPU::R113: return 113; + case SPU::R114: return 114; + case SPU::R115: return 115; + case SPU::R116: return 116; + case SPU::R117: return 117; + case SPU::R118: return 118; + case SPU::R119: return 119; + case SPU::R120: return 120; + case SPU::R121: return 121; + case SPU::R122: return 122; + case SPU::R123: return 123; + case SPU::R124: return 124; + case SPU::R125: return 125; + case SPU::R126: return 126; + case SPU::R127: return 127; + default: + report_fatal_error("Unhandled reg in SPURegisterInfo::getRegisterNumbering"); + } +} + +SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget, + const TargetInstrInfo &tii) : + SPUGenRegisterInfo(SPU::R0), Subtarget(subtarget), TII(tii) +{ +} + +/// getPointerRegClass - Return the register class to use to hold pointers. +/// This is used for addressing modes. +const TargetRegisterClass * +SPURegisterInfo::getPointerRegClass(unsigned Kind) const { + return &SPU::R32CRegClass; +} + +const uint16_t * +SPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const +{ + // Cell ABI calling convention + static const uint16_t SPU_CalleeSaveRegs[] = { + SPU::R80, SPU::R81, SPU::R82, SPU::R83, + SPU::R84, SPU::R85, SPU::R86, SPU::R87, + SPU::R88, SPU::R89, SPU::R90, SPU::R91, + SPU::R92, SPU::R93, SPU::R94, SPU::R95, + SPU::R96, SPU::R97, SPU::R98, SPU::R99, + SPU::R100, SPU::R101, SPU::R102, SPU::R103, + SPU::R104, SPU::R105, SPU::R106, SPU::R107, + SPU::R108, SPU::R109, SPU::R110, SPU::R111, + SPU::R112, SPU::R113, SPU::R114, SPU::R115, + SPU::R116, SPU::R117, SPU::R118, SPU::R119, + SPU::R120, SPU::R121, SPU::R122, SPU::R123, + SPU::R124, SPU::R125, SPU::R126, SPU::R127, + SPU::R2, /* environment pointer */ + SPU::R1, /* stack pointer */ + SPU::R0, /* link register */ + 0 /* end */ + }; + + return SPU_CalleeSaveRegs; +} + +/*! 
+ R0 (link register), R1 (stack pointer) and R2 (environment pointer -- this is + generally unused) are the Cell's reserved registers + */ +BitVector SPURegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(SPU::R0); // LR + Reserved.set(SPU::R1); // SP + Reserved.set(SPU::R2); // environment pointer + return Reserved; +} + +//===----------------------------------------------------------------------===// +// Stack Frame Processing methods +//===----------------------------------------------------------------------===// + +//-------------------------------------------------------------------------- +void +SPURegisterInfo::eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) + const +{ + // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions. + MBB.erase(I); +} + +void +SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + RegScavenger *RS) const +{ + unsigned i = 0; + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + DebugLoc dl = II->getDebugLoc(); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + MachineOperand &SPOp = MI.getOperand(i); + int FrameIndex = SPOp.getIndex(); + + // Now add the frame object offset to the offset from r1. + int Offset = MFI->getObjectOffset(FrameIndex); + + // Most instructions, except for generated FrameIndex additions using AIr32 + // and ILAr32, have the immediate in operand 1. AIr32 and ILAr32 have the + // immediate in operand 2. + unsigned OpNo = 1; + if (MI.getOpcode() == SPU::AIr32 || MI.getOpcode() == SPU::ILAr32) + OpNo = 2; + + MachineOperand &MO = MI.getOperand(OpNo); + + // Offset is biased by $lr's slot at the bottom. 
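+  // The final value is: object offset + original immediate + stack size
+  // + minStackSize() (the fixed area holding $lr's slot), i.e. an offset
+  // from the post-prologue $sp.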
+ Offset += MO.getImm() + MFI->getStackSize() + SPUFrameLowering::minStackSize(); + assert((Offset & 0xf) == 0 + && "16-byte alignment violated in eliminateFrameIndex"); + + // Replace the FrameIndex with base register with $sp (aka $r1) + SPOp.ChangeToRegister(SPU::R1, false); + + // if 'Offset' doesn't fit to the D-form instruction's + // immediate, convert the instruction to X-form + // if the instruction is not an AI (which takes a s10 immediate), assume + // it is a load/store that can take a s14 immediate + if ((MI.getOpcode() == SPU::AIr32 && !isInt<10>(Offset)) + || !isInt<14>(Offset)) { + int newOpcode = convertDFormToXForm(MI.getOpcode()); + unsigned tmpReg = findScratchRegister(II, RS, &SPU::R32CRegClass, SPAdj); + BuildMI(MBB, II, dl, TII.get(SPU::ILr32), tmpReg ) + .addImm(Offset); + BuildMI(MBB, II, dl, TII.get(newOpcode), MI.getOperand(0).getReg()) + .addReg(tmpReg, RegState::Kill) + .addReg(SPU::R1); + // remove the replaced D-form instruction + MBB.erase(II); + } else { + MO.ChangeToImmediate(Offset); + } +} + +unsigned +SPURegisterInfo::getFrameRegister(const MachineFunction &MF) const +{ + return SPU::R1; +} + +int +SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const +{ + switch(dFormOpcode) + { + case SPU::AIr32: return SPU::Ar32; + case SPU::LQDr32: return SPU::LQXr32; + case SPU::LQDr128: return SPU::LQXr128; + case SPU::LQDv16i8: return SPU::LQXv16i8; + case SPU::LQDv4i32: return SPU::LQXv4i32; + case SPU::LQDv4f32: return SPU::LQXv4f32; + case SPU::STQDr32: return SPU::STQXr32; + case SPU::STQDr128: return SPU::STQXr128; + case SPU::STQDv16i8: return SPU::STQXv16i8; + case SPU::STQDv4i32: return SPU::STQXv4i32; + case SPU::STQDv4f32: return SPU::STQXv4f32; + + default: assert( false && "Unhandled D to X-form conversion"); + } + // default will assert, but need to return something to keep the + // compiler happy. + return dFormOpcode; +} + +// TODO this is already copied from PPC. Could this convenience function +// be moved to the RegScavenger class? +unsigned +SPURegisterInfo::findScratchRegister(MachineBasicBlock::iterator II, + RegScavenger *RS, + const TargetRegisterClass *RC, + int SPAdj) const +{ + assert(RS && "Register scavenging must be on"); + unsigned Reg = RS->FindUnusedReg(RC); + if (Reg == 0) + Reg = RS->scavengeRegister(RC, II, SPAdj); + assert( Reg && "Register scavenger failed"); + return Reg; +} diff --git a/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.h b/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.h new file mode 100644 index 000000000000..e5ab22422502 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.h @@ -0,0 +1,101 @@ +//===-- SPURegisterInfo.h - Cell SPU Register Information Impl --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Cell SPU implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef SPU_REGISTERINFO_H +#define SPU_REGISTERINFO_H + +#include "SPU.h" + +#define GET_REGINFO_HEADER +#include "SPUGenRegisterInfo.inc" + +namespace llvm { + class SPUSubtarget; + class TargetInstrInfo; + class Type; + + class SPURegisterInfo : public SPUGenRegisterInfo { + private: + const SPUSubtarget &Subtarget; + const TargetInstrInfo &TII; + + //! 
Predicate: Does the machine function use the link register?
+    bool usesLR(MachineFunction &MF) const;
+
+  public:
+    SPURegisterInfo(const SPUSubtarget &subtarget, const TargetInstrInfo &tii);
+
+    //! Translate a register's enum value to a register number
+    /*!
+      This method translates a register's enum value to its register number,
+      e.g. SPU::R14 -> 14.
+    */
+    static unsigned getRegisterNumbering(unsigned RegEnum);
+
+    /// getPointerRegClass - Return the register class to use to hold pointers.
+    /// This is used for addressing modes.
+    virtual const TargetRegisterClass *
+    getPointerRegClass(unsigned Kind = 0) const;
+
+    /// After allocating this many registers, the allocator should feel
+    /// register pressure. The value is a somewhat random guess, based on the
+    /// number of non-callee-saved registers in the C calling convention.
+    virtual unsigned getRegPressureLimit( const TargetRegisterClass *RC,
+                                          MachineFunction &MF) const{
+      return 50;
+    }
+
+    //! Return the array of callee-saved registers
+    virtual const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
+
+    //! Allow for scavenging, so we can get scratch registers when needed.
+    virtual bool requiresRegisterScavenging(const MachineFunction &MF) const
+    { return true; }
+
+    //! Return the reserved registers
+    BitVector getReservedRegs(const MachineFunction &MF) const;
+
+    //! Eliminate the call frame setup pseudo-instructions
+    void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I) const;
+    //! Convert frame indices into machine operands
+    void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                             RegScavenger *RS = NULL) const;
+
+    //! Get the stack frame register (SP, aka R1)
+    unsigned getFrameRegister(const MachineFunction &MF) const;
+
+    //------------------------------------------------------------------------
+    // New methods added:
+    //------------------------------------------------------------------------
+
+    //! Convert D-form load/store to X-form load/store
+    /*!
+      Converts a register displacement load/store into a register-indexed
+      load/store for large stack frames, when the stack frame exceeds the
+      range of an s10 displacement.
+    */
+    int convertDFormToXForm(int dFormOpcode) const;
+
+    //! Acquire an unused register in an emergency.
+    unsigned findScratchRegister(MachineBasicBlock::iterator II,
+                                 RegScavenger *RS,
+                                 const TargetRegisterClass *RC,
+                                 int SPAdj) const;
+
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.td b/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.td
new file mode 100644
index 000000000000..f27b042edd63
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/SPURegisterInfo.td
@@ -0,0 +1,183 @@
+//===-- SPURegisterInfo.td - The Cell SPU Register File ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +class SPUReg<string n> : Register<n> { + let Namespace = "SPU"; +} + +// The SPU's register are all 128-bits wide, which makes specifying the +// registers relatively easy, if relatively mundane: + +class SPUVecReg<bits<7> num, string n> : SPUReg<n> { + field bits<7> Num = num; +} + +def R0 : SPUVecReg<0, "$lr">, DwarfRegNum<[0]>; +def R1 : SPUVecReg<1, "$sp">, DwarfRegNum<[1]>; +def R2 : SPUVecReg<2, "$2">, DwarfRegNum<[2]>; +def R3 : SPUVecReg<3, "$3">, DwarfRegNum<[3]>; +def R4 : SPUVecReg<4, "$4">, DwarfRegNum<[4]>; +def R5 : SPUVecReg<5, "$5">, DwarfRegNum<[5]>; +def R6 : SPUVecReg<6, "$6">, DwarfRegNum<[6]>; +def R7 : SPUVecReg<7, "$7">, DwarfRegNum<[7]>; +def R8 : SPUVecReg<8, "$8">, DwarfRegNum<[8]>; +def R9 : SPUVecReg<9, "$9">, DwarfRegNum<[9]>; +def R10 : SPUVecReg<10, "$10">, DwarfRegNum<[10]>; +def R11 : SPUVecReg<11, "$11">, DwarfRegNum<[11]>; +def R12 : SPUVecReg<12, "$12">, DwarfRegNum<[12]>; +def R13 : SPUVecReg<13, "$13">, DwarfRegNum<[13]>; +def R14 : SPUVecReg<14, "$14">, DwarfRegNum<[14]>; +def R15 : SPUVecReg<15, "$15">, DwarfRegNum<[15]>; +def R16 : SPUVecReg<16, "$16">, DwarfRegNum<[16]>; +def R17 : SPUVecReg<17, "$17">, DwarfRegNum<[17]>; +def R18 : SPUVecReg<18, "$18">, DwarfRegNum<[18]>; +def R19 : SPUVecReg<19, "$19">, DwarfRegNum<[19]>; +def R20 : SPUVecReg<20, "$20">, DwarfRegNum<[20]>; +def R21 : SPUVecReg<21, "$21">, DwarfRegNum<[21]>; +def R22 : SPUVecReg<22, "$22">, DwarfRegNum<[22]>; +def R23 : SPUVecReg<23, "$23">, DwarfRegNum<[23]>; +def R24 : SPUVecReg<24, "$24">, DwarfRegNum<[24]>; +def R25 : SPUVecReg<25, "$25">, DwarfRegNum<[25]>; +def R26 : SPUVecReg<26, "$26">, DwarfRegNum<[26]>; +def R27 : SPUVecReg<27, "$27">, DwarfRegNum<[27]>; +def R28 : SPUVecReg<28, "$28">, DwarfRegNum<[28]>; +def R29 : SPUVecReg<29, "$29">, DwarfRegNum<[29]>; +def R30 : SPUVecReg<30, "$30">, DwarfRegNum<[30]>; +def R31 : SPUVecReg<31, "$31">, DwarfRegNum<[31]>; +def R32 : SPUVecReg<32, "$32">, DwarfRegNum<[32]>; +def R33 : SPUVecReg<33, "$33">, DwarfRegNum<[33]>; +def R34 : SPUVecReg<34, "$34">, DwarfRegNum<[34]>; +def R35 : SPUVecReg<35, "$35">, DwarfRegNum<[35]>; +def R36 : SPUVecReg<36, "$36">, DwarfRegNum<[36]>; +def R37 : SPUVecReg<37, "$37">, DwarfRegNum<[37]>; +def R38 : SPUVecReg<38, "$38">, DwarfRegNum<[38]>; +def R39 : SPUVecReg<39, "$39">, DwarfRegNum<[39]>; +def R40 : SPUVecReg<40, "$40">, DwarfRegNum<[40]>; +def R41 : SPUVecReg<41, "$41">, DwarfRegNum<[41]>; +def R42 : SPUVecReg<42, "$42">, DwarfRegNum<[42]>; +def R43 : SPUVecReg<43, "$43">, DwarfRegNum<[43]>; +def R44 : SPUVecReg<44, "$44">, DwarfRegNum<[44]>; +def R45 : SPUVecReg<45, "$45">, DwarfRegNum<[45]>; +def R46 : SPUVecReg<46, "$46">, DwarfRegNum<[46]>; +def R47 : SPUVecReg<47, "$47">, DwarfRegNum<[47]>; +def R48 : SPUVecReg<48, "$48">, DwarfRegNum<[48]>; +def R49 : SPUVecReg<49, "$49">, DwarfRegNum<[49]>; +def R50 : SPUVecReg<50, "$50">, DwarfRegNum<[50]>; +def R51 : SPUVecReg<51, "$51">, DwarfRegNum<[51]>; +def R52 : SPUVecReg<52, "$52">, DwarfRegNum<[52]>; +def R53 : SPUVecReg<53, "$53">, DwarfRegNum<[53]>; +def R54 : SPUVecReg<54, "$54">, DwarfRegNum<[54]>; +def R55 : SPUVecReg<55, "$55">, DwarfRegNum<[55]>; +def R56 : SPUVecReg<56, "$56">, DwarfRegNum<[56]>; +def R57 : SPUVecReg<57, "$57">, DwarfRegNum<[57]>; +def R58 : SPUVecReg<58, "$58">, DwarfRegNum<[58]>; +def R59 : SPUVecReg<59, "$59">, DwarfRegNum<[59]>; +def R60 
: SPUVecReg<60, "$60">, DwarfRegNum<[60]>; +def R61 : SPUVecReg<61, "$61">, DwarfRegNum<[61]>; +def R62 : SPUVecReg<62, "$62">, DwarfRegNum<[62]>; +def R63 : SPUVecReg<63, "$63">, DwarfRegNum<[63]>; +def R64 : SPUVecReg<64, "$64">, DwarfRegNum<[64]>; +def R65 : SPUVecReg<65, "$65">, DwarfRegNum<[65]>; +def R66 : SPUVecReg<66, "$66">, DwarfRegNum<[66]>; +def R67 : SPUVecReg<67, "$67">, DwarfRegNum<[67]>; +def R68 : SPUVecReg<68, "$68">, DwarfRegNum<[68]>; +def R69 : SPUVecReg<69, "$69">, DwarfRegNum<[69]>; +def R70 : SPUVecReg<70, "$70">, DwarfRegNum<[70]>; +def R71 : SPUVecReg<71, "$71">, DwarfRegNum<[71]>; +def R72 : SPUVecReg<72, "$72">, DwarfRegNum<[72]>; +def R73 : SPUVecReg<73, "$73">, DwarfRegNum<[73]>; +def R74 : SPUVecReg<74, "$74">, DwarfRegNum<[74]>; +def R75 : SPUVecReg<75, "$75">, DwarfRegNum<[75]>; +def R76 : SPUVecReg<76, "$76">, DwarfRegNum<[76]>; +def R77 : SPUVecReg<77, "$77">, DwarfRegNum<[77]>; +def R78 : SPUVecReg<78, "$78">, DwarfRegNum<[78]>; +def R79 : SPUVecReg<79, "$79">, DwarfRegNum<[79]>; +def R80 : SPUVecReg<80, "$80">, DwarfRegNum<[80]>; +def R81 : SPUVecReg<81, "$81">, DwarfRegNum<[81]>; +def R82 : SPUVecReg<82, "$82">, DwarfRegNum<[82]>; +def R83 : SPUVecReg<83, "$83">, DwarfRegNum<[83]>; +def R84 : SPUVecReg<84, "$84">, DwarfRegNum<[84]>; +def R85 : SPUVecReg<85, "$85">, DwarfRegNum<[85]>; +def R86 : SPUVecReg<86, "$86">, DwarfRegNum<[86]>; +def R87 : SPUVecReg<87, "$87">, DwarfRegNum<[87]>; +def R88 : SPUVecReg<88, "$88">, DwarfRegNum<[88]>; +def R89 : SPUVecReg<89, "$89">, DwarfRegNum<[89]>; +def R90 : SPUVecReg<90, "$90">, DwarfRegNum<[90]>; +def R91 : SPUVecReg<91, "$91">, DwarfRegNum<[91]>; +def R92 : SPUVecReg<92, "$92">, DwarfRegNum<[92]>; +def R93 : SPUVecReg<93, "$93">, DwarfRegNum<[93]>; +def R94 : SPUVecReg<94, "$94">, DwarfRegNum<[94]>; +def R95 : SPUVecReg<95, "$95">, DwarfRegNum<[95]>; +def R96 : SPUVecReg<96, "$96">, DwarfRegNum<[96]>; +def R97 : SPUVecReg<97, "$97">, DwarfRegNum<[97]>; +def R98 : SPUVecReg<98, "$98">, DwarfRegNum<[98]>; +def R99 : SPUVecReg<99, "$99">, DwarfRegNum<[99]>; +def R100 : SPUVecReg<100, "$100">, DwarfRegNum<[100]>; +def R101 : SPUVecReg<101, "$101">, DwarfRegNum<[101]>; +def R102 : SPUVecReg<102, "$102">, DwarfRegNum<[102]>; +def R103 : SPUVecReg<103, "$103">, DwarfRegNum<[103]>; +def R104 : SPUVecReg<104, "$104">, DwarfRegNum<[104]>; +def R105 : SPUVecReg<105, "$105">, DwarfRegNum<[105]>; +def R106 : SPUVecReg<106, "$106">, DwarfRegNum<[106]>; +def R107 : SPUVecReg<107, "$107">, DwarfRegNum<[107]>; +def R108 : SPUVecReg<108, "$108">, DwarfRegNum<[108]>; +def R109 : SPUVecReg<109, "$109">, DwarfRegNum<[109]>; +def R110 : SPUVecReg<110, "$110">, DwarfRegNum<[110]>; +def R111 : SPUVecReg<111, "$111">, DwarfRegNum<[111]>; +def R112 : SPUVecReg<112, "$112">, DwarfRegNum<[112]>; +def R113 : SPUVecReg<113, "$113">, DwarfRegNum<[113]>; +def R114 : SPUVecReg<114, "$114">, DwarfRegNum<[114]>; +def R115 : SPUVecReg<115, "$115">, DwarfRegNum<[115]>; +def R116 : SPUVecReg<116, "$116">, DwarfRegNum<[116]>; +def R117 : SPUVecReg<117, "$117">, DwarfRegNum<[117]>; +def R118 : SPUVecReg<118, "$118">, DwarfRegNum<[118]>; +def R119 : SPUVecReg<119, "$119">, DwarfRegNum<[119]>; +def R120 : SPUVecReg<120, "$120">, DwarfRegNum<[120]>; +def R121 : SPUVecReg<121, "$121">, DwarfRegNum<[121]>; +def R122 : SPUVecReg<122, "$122">, DwarfRegNum<[122]>; +def R123 : SPUVecReg<123, "$123">, DwarfRegNum<[123]>; +def R124 : SPUVecReg<124, "$124">, DwarfRegNum<[124]>; +def R125 : SPUVecReg<125, "$125">, DwarfRegNum<[125]>; +def R126 : SPUVecReg<126, 
"$126">, DwarfRegNum<[126]>; +def R127 : SPUVecReg<127, "$127">, DwarfRegNum<[127]>; + +/* Need floating point status register here: */ +/* def FPCSR : ... */ + +// The SPU's registers as 128-bit wide entities, and can function as general +// purpose registers, where the operands are in the "preferred slot": +// The non-volatile registers are allocated in reverse order, like PPC does it. +def GPRC : RegisterClass<"SPU", [i128], 128, + (add (sequence "R%u", 0, 79), + (sequence "R%u", 127, 80))>; + +// The SPU's registers as 64-bit wide (double word integer) "preferred slot": +def R64C : RegisterClass<"SPU", [i64], 128, (add GPRC)>; + +// The SPU's registers as 64-bit wide (double word) FP "preferred slot": +def R64FP : RegisterClass<"SPU", [f64], 128, (add GPRC)>; + +// The SPU's registers as 32-bit wide (word) "preferred slot": +def R32C : RegisterClass<"SPU", [i32], 128, (add GPRC)>; + +// The SPU's registers as single precision floating point "preferred slot": +def R32FP : RegisterClass<"SPU", [f32], 128, (add GPRC)>; + +// The SPU's registers as 16-bit wide (halfword) "preferred slot": +def R16C : RegisterClass<"SPU", [i16], 128, (add GPRC)>; + +// The SPU's registers as 8-bit wide (byte) "preferred slot": +def R8C : RegisterClass<"SPU", [i8], 128, (add GPRC)>; + +// The SPU's registers as vector registers: +def VECREG : RegisterClass<"SPU", [v16i8,v8i16,v4i32,v4f32,v2i64,v2f64], 128, + (add GPRC)>; diff --git a/contrib/llvm/lib/Target/CellSPU/SPURegisterNames.h b/contrib/llvm/lib/Target/CellSPU/SPURegisterNames.h new file mode 100644 index 000000000000..e557ed340a28 --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPURegisterNames.h @@ -0,0 +1,19 @@ +//===- SPURegisterNames.h - Wrapper header for SPU register names -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef SPU_REGISTER_NAMES_H +#define SPU_REGISTER_NAMES_H + +// Define symbolic names for Cell registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "SPUGenRegisterInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/CellSPU/SPUSchedule.td b/contrib/llvm/lib/Target/CellSPU/SPUSchedule.td new file mode 100644 index 000000000000..9ccd0844e48e --- /dev/null +++ b/contrib/llvm/lib/Target/CellSPU/SPUSchedule.td @@ -0,0 +1,59 @@ +//===-- SPUSchedule.td - Cell Scheduling Definitions -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
diff --git a/contrib/llvm/lib/Target/CellSPU/SPUSchedule.td b/contrib/llvm/lib/Target/CellSPU/SPUSchedule.td
new file mode 100644
index 000000000000..9ccd0844e48e
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/SPUSchedule.td
@@ -0,0 +1,59 @@
+//===-- SPUSchedule.td - Cell Scheduling Definitions -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units (even and odd execution pipelines):
+
+def EVEN_UNIT : FuncUnit;  // Even execution unit: (PC & 0x7 == 000)
+def ODD_UNIT  : FuncUnit;  // Odd execution unit:  (PC & 0x7 == 100)
+
+//===----------------------------------------------------------------------===//
+// Instruction itinerary classes used for Cell SPU
+//===----------------------------------------------------------------------===//
+
+def LoadStore    : InstrItinClass;   // ODD_UNIT
+def BranchHints  : InstrItinClass;   // ODD_UNIT
+def BranchResolv : InstrItinClass;   // ODD_UNIT
+def ChanOpSPR    : InstrItinClass;   // ODD_UNIT
+def ShuffleOp    : InstrItinClass;   // ODD_UNIT
+def SelectOp     : InstrItinClass;   // ODD_UNIT
+def GatherOp     : InstrItinClass;   // ODD_UNIT
+def LoadNOP      : InstrItinClass;   // ODD_UNIT
+def ExecNOP      : InstrItinClass;   // EVEN_UNIT
+def SPrecFP      : InstrItinClass;   // EVEN_UNIT
+def DPrecFP      : InstrItinClass;   // EVEN_UNIT
+def FPInt        : InstrItinClass;   // EVEN_UNIT (FP <-> integer)
+def ByteOp       : InstrItinClass;   // EVEN_UNIT
+def IntegerOp    : InstrItinClass;   // EVEN_UNIT
+def IntegerMulDiv: InstrItinClass;   // EVEN_UNIT
+def RotShiftVec  : InstrItinClass;   // EVEN_UNIT, rotate/shift within vector elements
+def RotShiftQuad : InstrItinClass;   // ODD_UNIT, rotate/shift of the entire quadword
+def ImmLoad      : InstrItinClass;   // EVEN_UNIT
+
+/* Note: The itinerary for the Cell SPU is somewhat contrived... */
+def SPUItineraries : ProcessorItineraries<[ODD_UNIT, EVEN_UNIT], [], [
+  InstrItinData<LoadStore   , [InstrStage<6,  [ODD_UNIT]>]>,
+  InstrItinData<BranchHints , [InstrStage<6,  [ODD_UNIT]>]>,
+  InstrItinData<BranchResolv, [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<ChanOpSPR   , [InstrStage<6,  [ODD_UNIT]>]>,
+  InstrItinData<ShuffleOp   , [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<SelectOp    , [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<GatherOp    , [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<LoadNOP     , [InstrStage<1,  [ODD_UNIT]>]>,
+  InstrItinData<ExecNOP     , [InstrStage<1,  [EVEN_UNIT]>]>,
+  InstrItinData<SPrecFP     , [InstrStage<6,  [EVEN_UNIT]>]>,
+  InstrItinData<DPrecFP     , [InstrStage<13, [EVEN_UNIT]>]>,
+  InstrItinData<FPInt       , [InstrStage<2,  [EVEN_UNIT]>]>,
+  InstrItinData<ByteOp      , [InstrStage<4,  [EVEN_UNIT]>]>,
+  InstrItinData<IntegerOp   , [InstrStage<2,  [EVEN_UNIT]>]>,
+  InstrItinData<RotShiftVec , [InstrStage<4,  [EVEN_UNIT]>]>,
+  InstrItinData<RotShiftQuad, [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<IntegerMulDiv,[InstrStage<7,  [EVEN_UNIT]>]>,
+  InstrItinData<ImmLoad     , [InstrStage<2,  [EVEN_UNIT]>]>
+  ]>;
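The itinerary above only says which pipeline each instruction class wants and how many cycles it occupies; whether two instructions actually dual-issue also depends on their addresses, as the "(PC & 0x7)" comments indicate. The standalone C++ sketch below (illustrative only, not LLVM code) shows just that address mapping: SPU instructions are 4 bytes wide, so within an 8-byte fetch pair the instruction at offset 0 can go to the even pipe and the one at offset 4 to the odd pipe.

// Map an instruction address to the SPU issue slot suggested by its
// low address bits: (PC & 0x7) == 0b000 -> even pipe, 0b100 -> odd pipe.
#include <cstdint>
#include <cstdio>

const char *pipeFor(uint32_t PC) {
  return (PC & 0x7) == 0 ? "even" : "odd";
}

int main() {
  for (uint32_t PC = 0x120; PC < 0x130; PC += 4)   // 4-byte instructions
    std::printf("insn at 0x%x issues on the %s pipe\n", PC, pipeFor(PC));
  return 0;
}

This address constraint is why the backend later inserts nops/lnops (see the SPUNopFiller pass added in SPUTargetMachine.cpp below) to line even-pipe and odd-pipe instructions up for dual issue.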
diff --git a/contrib/llvm/lib/Target/CellSPU/SPUSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/CellSPU/SPUSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..5732fd43cdc2
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/SPUSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- SPUSelectionDAGInfo.cpp - CellSPU SelectionDAG Info ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPUSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "cellspu-selectiondag-info"
+#include "SPUTargetMachine.h"
+using namespace llvm;
+
+SPUSelectionDAGInfo::SPUSelectionDAGInfo(const SPUTargetMachine &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+SPUSelectionDAGInfo::~SPUSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/CellSPU/SPUSelectionDAGInfo.h b/contrib/llvm/lib/Target/CellSPU/SPUSelectionDAGInfo.h
new file mode 100644
index 000000000000..39257d92c400
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/SPUSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- SPUSelectionDAGInfo.h - CellSPU SelectionDAG Info -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the CellSPU subclass of TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CELLSPUSELECTIONDAGINFO_H
+#define CELLSPUSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class SPUTargetMachine;
+
+class SPUSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit SPUSelectionDAGInfo(const SPUTargetMachine &TM);
+  ~SPUSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/CellSPU/SPUSubtarget.cpp b/contrib/llvm/lib/Target/CellSPU/SPUSubtarget.cpp
new file mode 100644
index 000000000000..eec2d250be7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/SPUSubtarget.cpp
@@ -0,0 +1,65 @@
+//===-- SPUSubtarget.cpp - STI Cell SPU Subtarget Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CellSPU-specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPUSubtarget.h"
+#include "SPU.h"
+#include "SPURegisterInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SPUGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+SPUSubtarget::SPUSubtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS) :
+  SPUGenSubtargetInfo(TT, CPU, FS),
+  StackAlignment(16),
+  ProcDirective(SPU::DEFAULT_PROC),
+  UseLargeMem(false)
+{
+  // Should be the target SPU processor type. For now, since there's only
+  // one, simply default to the current "v0" default:
+  std::string default_cpu("v0");
+
+  // Parse the features string.
+  ParseSubtargetFeatures(default_cpu, FS);
+
+  // Initialize the scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(default_cpu);
+}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void SPUSubtarget::SetJITMode() {
+}
+
+/// Enable PostRA scheduling for optimization levels -O2 and -O3.
+bool SPUSubtarget::enablePostRAScheduler(
+                       CodeGenOpt::Level OptLevel,
+                       TargetSubtargetInfo::AntiDepBreakMode &Mode,
+                       RegClassVector &CriticalPathRCs) const {
+  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  // CriticalPathRCs is the set of register classes for which anti-dependency
+  // breaking is performed; do it for all register classes.
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(&SPU::R8CRegClass);
+  CriticalPathRCs.push_back(&SPU::R16CRegClass);
+  CriticalPathRCs.push_back(&SPU::R32CRegClass);
+  CriticalPathRCs.push_back(&SPU::R32FPRegClass);
+  CriticalPathRCs.push_back(&SPU::R64CRegClass);
+  CriticalPathRCs.push_back(&SPU::VECREGRegClass);
+  return OptLevel >= CodeGenOpt::Default;
+}
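enablePostRAScheduler above asks the generic post-RA scheduler to break "critical" anti-dependencies for every SPU register class. The standalone C++ sketch below (conceptual only, not LLVM code; the instruction struct and register numbers are invented for illustration) shows the kind of hazard that breaking removes: a write-after-read on $3 that, once the second definition is renamed to a free register, no longer forces the two instructions to stay in order.

// A write-after-read (anti) dependency and its removal by renaming.
#include <cstdio>
#include <string>

struct Inst {
  std::string Text;
  int Def;   // register defined (-1 if none)
  int Use;   // one register read (-1 if none)
};

int main() {
  Inst A = {"a    $5, $3, $4", 5, 3};   // reads $3
  Inst B = {"il   $3, 10",     3, -1};  // redefines $3: WAR hazard with A
  if (B.Def == A.Use) {                 // break the anti-dependency
    B.Def  = 79;                        // any register not live here
    B.Text = "il   $79, 10";
  }
  std::printf("%s\n%s\n", A.Text.c_str(), B.Text.c_str());
  return 0;
}

With the hazard gone, the scheduler is free to reorder or pair the two instructions, which matters on an in-order, dual-issue core like the SPU.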
diff --git a/contrib/llvm/lib/Target/CellSPU/SPUSubtarget.h b/contrib/llvm/lib/Target/CellSPU/SPUSubtarget.h
new file mode 100644
index 000000000000..7c4aa1430217
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/SPUSubtarget.h
@@ -0,0 +1,97 @@
+//===-- SPUSubtarget.h - Define Subtarget for the Cell SPU ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Cell SPU-specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CELLSUBTARGET_H
+#define CELLSUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SPUGenSubtargetInfo.inc"
+
+namespace llvm {
+  class GlobalValue;
+  class StringRef;
+
+  namespace SPU {
+    enum {
+      PROC_NONE,
+      DEFAULT_PROC
+    };
+  }
+
+  class SPUSubtarget : public SPUGenSubtargetInfo {
+  protected:
+    /// StackAlignment - The minimum alignment known to hold of the stack frame
+    /// on entry to the function and which must be maintained by every function.
+    unsigned StackAlignment;
+
+    /// Selected instruction itineraries (one entry per itinerary class).
+    InstrItineraryData InstrItins;
+
+    /// Which SPU processor (this isn't really used, but it's there to keep
+    /// the C compiler happy).
+    unsigned ProcDirective;
+
+    /// Use (assume) large memory -- effectively disables the LQA/STQA
+    /// instructions, which assume a 256K local store.
+    bool UseLargeMem;
+
+  public:
+    /// This constructor initializes the data members to match that
+    /// of the specified triple.
+    ///
+    SPUSubtarget(const std::string &TT, const std::string &CPU,
+                 const std::string &FS);
+
+    /// ParseSubtargetFeatures - Parses the features string, setting the
+    /// specified subtarget options. The definition of this function is
+    /// auto-generated by tblgen.
+    void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+    /// SetJITMode - This is called to inform the subtarget info that we are
+    /// producing code for the JIT.
+    void SetJITMode();
+
+    /// getStackAlignment - Returns the minimum alignment known to hold of the
+    /// stack frame on entry to the function and which must be maintained by
+    /// every function for this subtarget.
+    unsigned getStackAlignment() const { return StackAlignment; }
+
+    /// getInstrItineraryData - Return the instruction itineraries based on
+    /// subtarget selection.
+    const InstrItineraryData &getInstrItineraryData() const {
+      return InstrItins;
+    }
+
+    /// usingLargeMem - Large memory addressing predicate.
+    bool usingLargeMem() const {
+      return UseLargeMem;
+    }
+
+    /// getTargetDataString - Return the pointer size and type alignment
+    /// properties of this subtarget.
+    const char *getTargetDataString() const {
+      return "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128"
+             "-i16:16:128-i8:8:128-i1:8:128-a:0:128-v64:64:128-v128:128:128"
+             "-s:128:128-n32:64";
+    }
+
+    bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                               TargetSubtargetInfo::AntiDepBreakMode &Mode,
+                               RegClassVector &CriticalPathRCs) const;
+  };
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/CellSPU/SPUTargetMachine.cpp b/contrib/llvm/lib/Target/CellSPU/SPUTargetMachine.cpp
new file mode 100644
index 000000000000..21f6b25bf256
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -0,0 +1,93 @@
+//===-- SPUTargetMachine.cpp - Define TargetMachine for Cell SPU ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the Cell SPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPUTargetMachine.h"
+#include "SPU.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeCellSPUTarget() {
+  // Register the target.
+  RegisterTargetMachine<SPUTargetMachine> X(TheCellSPUTarget);
+}
+
+const std::pair<unsigned, int> *
+SPUFrameLowering::getCalleeSaveSpillSlots(unsigned &NumEntries) const {
+  NumEntries = 1;
+  return &LR[0];
+}
+
+SPUTargetMachine::SPUTargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS),
+    DataLayout(Subtarget.getTargetDataString()),
+    InstrInfo(*this),
+    FrameLowering(Subtarget),
+    TLInfo(*this),
+    TSInfo(*this),
+    InstrItins(Subtarget.getInstrItineraryData()) {
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// SPU Code Generator Pass Configuration Options.
+class SPUPassConfig : public TargetPassConfig {
+public:
+  SPUPassConfig(SPUTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  SPUTargetMachine &getSPUTargetMachine() const {
+    return getTM<SPUTargetMachine>();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *SPUTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new SPUPassConfig(this, PM);
+}
+
+bool SPUPassConfig::addInstSelector() {
+  // Install an instruction selector.
+  PM.add(createSPUISelDag(getSPUTargetMachine()));
+  return false;
+}
+
+// Passes to run just before printing the assembly.
+bool SPUPassConfig::addPreEmitPass() {
+  // Load the TCE instruction scheduler, if it is available via loaded plugins.
+  typedef llvm::FunctionPass* (*BuilderFunc)(const char*);
+  BuilderFunc schedulerCreator =
+    (BuilderFunc)(intptr_t)sys::DynamicLibrary::SearchForAddressOfSymbol(
+        "createTCESchedulerPass");
+  if (schedulerCreator != NULL)
+    PM.add(schedulerCreator("cellspu"));
+
+  // Align instructions with nops/lnops for dual issue.
+  PM.add(createSPUNopFillerPass(getSPUTargetMachine()));
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/CellSPU/SPUTargetMachine.h b/contrib/llvm/lib/Target/CellSPU/SPUTargetMachine.h
new file mode 100644
index 000000000000..3e5d38c919c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/SPUTargetMachine.h
@@ -0,0 +1,87 @@
+//===-- SPUTargetMachine.h - Define TargetMachine for Cell SPU --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CellSPU-specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_TARGETMACHINE_H
+#define SPU_TARGETMACHINE_H
+
+#include "SPUSubtarget.h"
+#include "SPUInstrInfo.h"
+#include "SPUISelLowering.h"
+#include "SPUSelectionDAGInfo.h"
+#include "SPUFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+/// SPUTargetMachine
+///
+class SPUTargetMachine : public LLVMTargetMachine {
+  SPUSubtarget        Subtarget;
+  const TargetData    DataLayout;
+  SPUInstrInfo        InstrInfo;
+  SPUFrameLowering    FrameLowering;
+  SPUTargetLowering   TLInfo;
+  SPUSelectionDAGInfo TSInfo;
+  InstrItineraryData  InstrItins;
+public:
+  SPUTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL);
+
+  /// Return the subtarget implementation object.
+  virtual const SPUSubtarget *getSubtargetImpl() const {
+    return &Subtarget;
+  }
+  virtual const SPUInstrInfo *getInstrInfo() const {
+    return &InstrInfo;
+  }
+  virtual const SPUFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  /*!
+    \note Cell SPU does not support JIT today. It could support JIT at some
+    point.
+  */
+  virtual TargetJITInfo *getJITInfo() {
+    return NULL;
+  }
+
+  virtual const SPUTargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+
+  virtual const SPUSelectionDAGInfo *getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+
+  virtual const SPURegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  virtual const TargetData *getTargetData() const {
+    return &DataLayout;
+  }
+
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return &InstrItins;
+  }
+
+  // Pass Pipeline Configuration
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp b/contrib/llvm/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
new file mode 100644
index 000000000000..84aadfad6f8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- CellSPUTargetInfo.cpp - CellSPU Target Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "llvm/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheCellSPUTarget;
+
+extern "C" void LLVMInitializeCellSPUTargetInfo() {
+  RegisterTarget<Triple::cellspu>
+    X(TheCellSPUTarget, "cellspu", "STI CBEA Cell SPU [experimental]");
+}
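LLVMInitializeCellSPUTargetInfo() above is what makes the "cellspu" target discoverable through the TargetRegistry. The standalone C++ sketch below shows how a client of an LLVM tree of this vintage could look the target up; it assumes the program is linked against the CellSPU TargetInfo library and that the TargetRegistry header lives in llvm/Support, as in the includes above. The triple string used here is only an example.

// Look up the target registered by LLVMInitializeCellSPUTargetInfo().
#include "llvm/Support/TargetRegistry.h"
#include <cstdio>
#include <string>

extern "C" void LLVMInitializeCellSPUTargetInfo();

int main() {
  LLVMInitializeCellSPUTargetInfo();

  std::string Error;
  const llvm::Target *T =
      llvm::TargetRegistry::lookupTarget("cellspu-unknown-elf", Error);
  if (!T) {
    std::fprintf(stderr, "lookup failed: %s\n", Error.c_str());
    return 1;
  }
  std::printf("found '%s': %s\n", T->getName(), T->getShortDescription());
  return 0;
}

A successful lookup returns the same llvm::Target object that RegisterTargetMachine binds SPUTargetMachine to in SPUTargetMachine.cpp, so tools such as llc can then construct the full code generator from it.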