Successfully identified regression in *llvm* in CI configuration tcwg_bmk_llvm_tx1/llvm-release-aarch64-spec2k6-Oz. So far, this commit has regressed the following CI configurations: - tcwg_bmk_llvm_tx1/llvm-release-aarch64-spec2k6-Oz
Culprit: <cut> commit 4b1120159274ca98b4a58064b116aba88372ac98 Author: Michael Liao michael.hliao@gmail.com Date: Mon Oct 12 10:01:40 2020 -0400
[MachineInstr] Add support for instructions with multiple memory operands.
- Basically iterate each pair of memory operands from both instructions and return true if any of them may alias. - The exception are memory instructions without any memory operand. They may touch everything and could alias to any memory instruction.
Differential Revision: https://reviews.llvm.org/D89447 </cut>
Results regressed to (for first_bad == 4b1120159274ca98b4a58064b116aba88372ac98) # reset_artifacts: -10 # build_abe binutils: -9 # build_abe stage1 -- --set gcc_override_configure=--disable-libsanitizer: -8 # build_abe linux: -7 # build_abe glibc: -6 # build_abe stage2 -- --set gcc_override_configure=--disable-libsanitizer: -5 # build_llvm true: -3 # true: 0 # benchmark -Oz -- artifacts/build-4b1120159274ca98b4a58064b116aba88372ac98/results_id: 1 # 445.gobmk,[.] do_play_move regressed by 111
from (for last_good == cb9d0e8819ad7ca2f349a57d62ea2af02d631dfa) # reset_artifacts: -10 # build_abe binutils: -9 # build_abe stage1 -- --set gcc_override_configure=--disable-libsanitizer: -8 # build_abe linux: -7 # build_abe glibc: -6 # build_abe stage2 -- --set gcc_override_configure=--disable-libsanitizer: -5 # build_llvm true: -3 # true: 0 # benchmark -Oz -- artifacts/build-cb9d0e8819ad7ca2f349a57d62ea2af02d631dfa/results_id: 1
Artifacts of last_good build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release-... Results ID of last_good: tx1_64/tcwg_bmk_llvm_tx1/bisect-llvm-release-aarch64-spec2k6-Oz/744 Artifacts of first_bad build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release-... Results ID of first_bad: tx1_64/tcwg_bmk_llvm_tx1/bisect-llvm-release-aarch64-spec2k6-Oz/734 Build top page/logs: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release-...
Configuration details:
Reproduce builds: <cut> mkdir investigate-llvm-4b1120159274ca98b4a58064b116aba88372ac98 cd investigate-llvm-4b1120159274ca98b4a58064b116aba88372ac98
git clone https://git.linaro.org/toolchain/jenkins-scripts
mkdir -p artifacts/manifests curl -o artifacts/manifests/build-baseline.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release-... --fail curl -o artifacts/manifests/build-parameters.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release-... --fail curl -o artifacts/test.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release-... --fail chmod +x artifacts/test.sh
# Reproduce the baseline build (build all prerequisites) ./jenkins-scripts/tcwg_bmk-build.sh @@ artifacts/manifests/build-baseline.sh
cd llvm
# Reproduce first_bad build git checkout --detach 4b1120159274ca98b4a58064b116aba88372ac98 ../artifacts/test.sh
# Reproduce last_good build git checkout --detach cb9d0e8819ad7ca2f349a57d62ea2af02d631dfa ../artifacts/test.sh
cd .. </cut>
History of pending regressions and results: https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/c...
Artifacts: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release-... Build log: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release-...
Full commit (up to 1000 lines): <cut> commit 4b1120159274ca98b4a58064b116aba88372ac98 Author: Michael Liao michael.hliao@gmail.com Date: Mon Oct 12 10:01:40 2020 -0400
[MachineInstr] Add support for instructions with multiple memory operands.
- Basically iterate each pair of memory operands from both instructions and return true if any of them may alias. - The exception are memory instructions without any memory operand. They may touch everything and could alias to any memory instruction.
Differential Revision: https://reviews.llvm.org/D89447 --- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 15 +++ llvm/lib/CodeGen/MachineInstr.cpp | 145 ++++++++++++--------- .../test/CodeGen/AArch64/merge-store-dependency.ll | 4 +- .../CodeGen/ARM/big-endian-neon-fp16-bitconv.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-float32regloops.ll | 8 +- llvm/test/CodeGen/Thumb2/mve-phireg.ll | 10 +- llvm/test/CodeGen/Thumb2/mve-vst3.ll | 2 +- .../Thumb2/umulo-128-legalisation-lowering.ll | 8 +- llvm/test/CodeGen/X86/store_op_load_fold2.ll | 11 +- 9 files changed, 118 insertions(+), 89 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 5d5a6efab220..68fc129cc0ed 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1737,6 +1737,21 @@ public: return 5; }
+ /// Return the maximal number of alias checks on memory operands. For + /// instructions with more than one memory operands, the alias check on a + /// single MachineInstr pair has quadratic overhead and results in + /// unacceptable performance in the worst case. The limit here is to clamp + /// that maximal checks performed. Usually, that's the product of memory + /// operand numbers from that pair of MachineInstr to be checked. For + /// instance, with two MachineInstrs with 4 and 5 memory operands + /// correspondingly, a total of 20 checks are required. With this limit set to + /// 16, their alias check is skipped. We choose to limit the product instead + /// of the individual instruction as targets may have special MachineInstrs + /// with a considerably high number of memory operands, such as `ldm` in ARM. + /// Setting this limit per MachineInstr would result in either too high + /// overhead or too rigid restriction. + virtual unsigned getMemOperandAACheckLimit() const { return 16; } + /// Return an array that contains the ids of the target indices (used for the /// TargetIndex machine operand) and their names. /// diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index d45c53b81816..fd658cdb41b9 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -1276,81 +1276,96 @@ bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other, if (TII->areMemAccessesTriviallyDisjoint(*this, Other)) return false;
- // FIXME: Need to handle multiple memory operands to support all targets. - if (!hasOneMemOperand() || !Other.hasOneMemOperand()) + // Memory operations without memory operands may access anything. Be + // conservative and assume `MayAlias`. + if (memoperands_empty() || Other.memoperands_empty()) return true;
- MachineMemOperand *MMOa = *memoperands_begin(); - MachineMemOperand *MMOb = *Other.memoperands_begin(); - - // The following interface to AA is fashioned after DAGCombiner::isAlias - // and operates with MachineMemOperand offset with some important - // assumptions: - // - LLVM fundamentally assumes flat address spaces. - // - MachineOperand offset can *only* result from legalization and - // cannot affect queries other than the trivial case of overlap - // checking. - // - These offsets never wrap and never step outside - // of allocated objects. - // - There should never be any negative offsets here. - // - // FIXME: Modify API to hide this math from "user" - // Even before we go to AA we can reason locally about some - // memory objects. It can save compile time, and possibly catch some - // corner cases not currently covered. - - int64_t OffsetA = MMOa->getOffset(); - int64_t OffsetB = MMOb->getOffset(); - int64_t MinOffset = std::min(OffsetA, OffsetB); - - uint64_t WidthA = MMOa->getSize(); - uint64_t WidthB = MMOb->getSize(); - bool KnownWidthA = WidthA != MemoryLocation::UnknownSize; - bool KnownWidthB = WidthB != MemoryLocation::UnknownSize; - - const Value *ValA = MMOa->getValue(); - const Value *ValB = MMOb->getValue(); - bool SameVal = (ValA && ValB && (ValA == ValB)); - if (!SameVal) { - const PseudoSourceValue *PSVa = MMOa->getPseudoValue(); - const PseudoSourceValue *PSVb = MMOb->getPseudoValue(); - if (PSVa && ValB && !PSVa->mayAlias(&MFI)) - return false; - if (PSVb && ValA && !PSVb->mayAlias(&MFI)) - return false; - if (PSVa && PSVb && (PSVa == PSVb)) - SameVal = true; - } + // Skip if there are too many memory operands. 
+ auto NumChecks = getNumMemOperands() * Other.getNumMemOperands(); + if (NumChecks > TII->getMemOperandAACheckLimit()) + return true; + + auto HasAlias = [MFI, AA, UseTBAA](const MachineMemOperand *MMOa, + const MachineMemOperand *MMOb) { + // The following interface to AA is fashioned after DAGCombiner::isAlias + // and operates with MachineMemOperand offset with some important + // assumptions: + // - LLVM fundamentally assumes flat address spaces. + // - MachineOperand offset can *only* result from legalization and + // cannot affect queries other than the trivial case of overlap + // checking. + // - These offsets never wrap and never step outside + // of allocated objects. + // - There should never be any negative offsets here. + // + // FIXME: Modify API to hide this math from "user" + // Even before we go to AA we can reason locally about some + // memory objects. It can save compile time, and possibly catch some + // corner cases not currently covered. + + int64_t OffsetA = MMOa->getOffset(); + int64_t OffsetB = MMOb->getOffset(); + int64_t MinOffset = std::min(OffsetA, OffsetB); + + uint64_t WidthA = MMOa->getSize(); + uint64_t WidthB = MMOb->getSize(); + bool KnownWidthA = WidthA != MemoryLocation::UnknownSize; + bool KnownWidthB = WidthB != MemoryLocation::UnknownSize; + + const Value *ValA = MMOa->getValue(); + const Value *ValB = MMOb->getValue(); + bool SameVal = (ValA && ValB && (ValA == ValB)); + if (!SameVal) { + const PseudoSourceValue *PSVa = MMOa->getPseudoValue(); + const PseudoSourceValue *PSVb = MMOb->getPseudoValue(); + if (PSVa && ValB && !PSVa->mayAlias(&MFI)) + return false; + if (PSVb && ValA && !PSVb->mayAlias(&MFI)) + return false; + if (PSVa && PSVb && (PSVa == PSVb)) + SameVal = true; + } + + if (SameVal) { + if (!KnownWidthA || !KnownWidthB) + return true; + int64_t MaxOffset = std::max(OffsetA, OffsetB); + int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB; + return (MinOffset + LowWidth > MaxOffset); + }
- if (SameVal) { - if (!KnownWidthA || !KnownWidthB) + if (!AA) return true; - int64_t MaxOffset = std::max(OffsetA, OffsetB); - int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB; - return (MinOffset + LowWidth > MaxOffset); - }
- if (!AA) - return true; + if (!ValA || !ValB) + return true;
- if (!ValA || !ValB) - return true; + assert((OffsetA >= 0) && "Negative MachineMemOperand offset"); + assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
- assert((OffsetA >= 0) && "Negative MachineMemOperand offset"); - assert((OffsetB >= 0) && "Negative MachineMemOperand offset"); + int64_t OverlapA = KnownWidthA ? WidthA + OffsetA - MinOffset + : MemoryLocation::UnknownSize; + int64_t OverlapB = KnownWidthB ? WidthB + OffsetB - MinOffset + : MemoryLocation::UnknownSize;
- int64_t OverlapA = KnownWidthA ? WidthA + OffsetA - MinOffset - : MemoryLocation::UnknownSize; - int64_t OverlapB = KnownWidthB ? WidthB + OffsetB - MinOffset - : MemoryLocation::UnknownSize; + AliasResult AAResult = + AA->alias(MemoryLocation(ValA, OverlapA, + UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), + MemoryLocation(ValB, OverlapB, + UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
- AliasResult AAResult = AA->alias( - MemoryLocation(ValA, OverlapA, - UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), - MemoryLocation(ValB, OverlapB, - UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); + return (AAResult != NoAlias); + };
- return (AAResult != NoAlias); + // Check each pair of memory operands from both instructions, which can't + // alias only if all pairs won't alias. + for (auto *MMOa : memoperands()) + for (auto *MMOb : Other.memoperands()) + if (HasAlias(MMOa, MMOb)) + return true; + + return false; }
/// hasOrderedMemoryRef - Return true if this instruction may have an ordered diff --git a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll index 6850846fec06..3a768d0e3f9b 100644 --- a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll +++ b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll @@ -19,11 +19,11 @@ define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg ; A53-NEXT: mov x19, x8 ; A53-NEXT: mov w0, w1 ; A53-NEXT: mov w9, #256 +; A53-NEXT: stp x2, x3, [x8, #32] +; A53-NEXT: mov x2, x8 ; A53-NEXT: str q0, [x19, #16]! ; A53-NEXT: str w1, [x19] ; A53-NEXT: mov w1, #4 -; A53-NEXT: stp x2, x3, [x8, #32] -; A53-NEXT: mov x2, x8 ; A53-NEXT: str q0, [x8] ; A53-NEXT: strh w9, [x8, #24] ; A53-NEXT: str wzr, [x8, #20] diff --git a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll index 9942d6df99a4..693f33553591 100644 --- a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll +++ b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll @@ -503,12 +503,12 @@ define void @conv_v8f16_to_i128( <8 x half> %a, i128* %store ) { ; CHECK-NEXT: vmov.32 r3, d16[1] ; CHECK-NEXT: vmov.32 r1, d16[0] ; CHECK-NEXT: subs r12, r12, #1 +; CHECK-NEXT: str r12, [r0, #12] ; CHECK-NEXT: sbcs r2, r2, #0 +; CHECK-NEXT: str r2, [r0, #8] ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: sbc r1, r1, #0 ; CHECK-NEXT: stm r0, {r1, r3} -; CHECK-NEXT: str r2, [r0, #8] -; CHECK-NEXT: str r12, [r0, #12] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index a43f564951e9..4fe8877aa8bd 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1094,6 +1094,7 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: ldrd r11, r8, [r12, 
#24] ; CHECK-NEXT: vstrb.8 q0, [r9], #16 ; CHECK-NEXT: vldrw.u32 q0, [r5], #32 +; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill ; CHECK-NEXT: vldrw.u32 q1, [r5, #-28] ; CHECK-NEXT: vmul.f32 q0, q0, r7 ; CHECK-NEXT: vldrw.u32 q6, [r5, #-24] @@ -1105,13 +1106,12 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: vfma.f32 q0, q4, r6 ; CHECK-NEXT: vldrw.u32 q3, [r5, #-8] ; CHECK-NEXT: vfma.f32 q0, q5, r3 -; CHECK-NEXT: vldrw.u32 q1, [r5, #-4] -; CHECK-NEXT: vfma.f32 q0, q2, lr ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vfma.f32 q0, q2, lr +; CHECK-NEXT: vldrw.u32 q1, [r5, #-4] ; CHECK-NEXT: vfma.f32 q0, q3, r11 -; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill -; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: cmp r0, #16 +; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: blo .LBB16_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll index e7d6a7323bc1..0fe26fbc4753 100644 --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -168,16 +168,14 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov q1, q4 ; CHECK-NEXT: vmov s1, r7 ; CHECK-NEXT: vmov.32 q1[1], r6 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: vmov.32 q1[2], r5 ; CHECK-NEXT: vmov.32 q5[0], r7 +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vmov s9, r4 ; CHECK-NEXT: vmov.32 q1[3], r4 -; CHECK-NEXT: strd r0, r10, [sp, #24] +; CHECK-NEXT: vdup.32 q6, r7 ; CHECK-NEXT: vstrw.32 q1, [sp, #76] ; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vmov s9, r4 ; CHECK-NEXT: vmov.32 q1[1], r7 -; CHECK-NEXT: vdup.32 q6, r7 ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vmov.f32 s8, s0 ; CHECK-NEXT: vmov.32 q1[2], r6 @@ -185,6 +183,7 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: mov.w r8, #4 +; CHECK-NEXT: mov.w r10, #0 ; 
CHECK-NEXT: vmov.32 q1[3], r4 ; CHECK-NEXT: vmov.32 q3[0], r4 ; CHECK-NEXT: vmov.32 q7[1], r4 @@ -192,6 +191,7 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov.f32 s11, s3 ; CHECK-NEXT: movs r1, #64 ; CHECK-NEXT: strh.w r8, [sp, #390] +; CHECK-NEXT: strd r0, r10, [sp, #24] ; CHECK-NEXT: vstrw.32 q0, [sp, #44] ; CHECK-NEXT: str r0, [r0] ; CHECK-NEXT: vstrw.32 q2, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index 600c5279ca91..1ae74c1738c7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -24,8 +24,8 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) { ; CHECK-NEXT: vmov.f32 s9, s6 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: strd r2, r0, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: strd r2, r0, [r1, #16] ; CHECK-NEXT: pop {r4, pc} entry: %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll index ac1c814b838e..f57c9226179b 100644 --- a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll @@ -8,17 +8,17 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; THUMBV7-NEXT: .pad #44 ; THUMBV7-NEXT: sub sp, #44 -; THUMBV7-NEXT: ldrd r4, r7, [sp, #88] -; THUMBV7-NEXT: mov r5, r3 ; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill ; THUMBV7-NEXT: movs r0, #0 -; THUMBV7-NEXT: strd r4, r7, [sp] -; THUMBV7-NEXT: mov r1, r3 +; THUMBV7-NEXT: ldrd r4, r7, [sp, #88] +; THUMBV7-NEXT: mov r5, r3 ; THUMBV7-NEXT: strd r0, r0, [sp, #8] +; THUMBV7-NEXT: mov r1, r3 ; THUMBV7-NEXT: mov r6, r2 ; THUMBV7-NEXT: mov r0, r2 ; THUMBV7-NEXT: movs r2, #0 ; THUMBV7-NEXT: movs r3, #0 +; THUMBV7-NEXT: strd r4, r7, [sp] ; THUMBV7-NEXT: bl __multi3 ; THUMBV7-NEXT: strd 
r1, r0, [sp, #32] @ 8-byte Folded Spill ; THUMBV7-NEXT: strd r3, r2, [sp, #24] @ 8-byte Folded Spill diff --git a/llvm/test/CodeGen/X86/store_op_load_fold2.ll b/llvm/test/CodeGen/X86/store_op_load_fold2.ll index 674b8d8f9384..6f088772436e 100644 --- a/llvm/test/CodeGen/X86/store_op_load_fold2.ll +++ b/llvm/test/CodeGen/X86/store_op_load_fold2.ll @@ -17,13 +17,12 @@ cond_true2732.preheader: ; preds = %entry store i64 %tmp2676.us.us, i64* %tmp2666 ret i32 0
-; INTEL: and {{e..}}, dword ptr [356] -; INTEL: and dword ptr [360], {{e..}} -; FIXME: mov dword ptr [356], {{e..}} -; The above line comes out as 'mov 360, eax', but when the register is ecx it works? +; INTEL-DAG: and {{e..}}, dword ptr [356] +; INTEL-DAG: and dword ptr [360], {{e..}} +; INTEL: mov dword ptr [356], {{e..}}
-; ATT: andl 356, %{{e..}} -; ATT: andl %{{e..}}, 360 +; ATT-DAG: andl 356, %{{e..}} +; ATT-DAG: andl %{{e..}}, 360 ; ATT: movl %{{e..}}, 356
} </cut>
linaro-toolchain@lists.linaro.org