# HG changeset patch
# User Vince Weaver <vince@csl.cornell.edu>
# Date 1257285316 18000
# Node ID aef69bb302b5c60a74d4f53ede04058c262ea018
# Parent  0e5037cecaf776e18a6be727981a33144f4bde64
add support for X86 sse3 haddps instruction

This patch adds support for the sse3 haddps instruction.

The microcode ends up being fairly long, since each 32-bit sum is built from
separate move, shift and add microops; I'm not sure if it can be done more compactly.

This instruction is used by the vpr benchmark from SPEC CPU2000.

diff -r 2e67bb7c9b4c src/arch/x86/isa/decoder/two_byte_opcodes.isa
--- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa	Mon Nov 09 10:02:55 2009 -0500
+++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa	Mon Nov 09 20:55:17 2009 -0500
@@ -715,7 +715,7 @@
                     }
                     // repne (0xF2)
                     0x8: decode OPCODE_OP_BOTTOM3 {
-                        0x4: WarnUnimpl::haddps_Vo_Wo();
+                        0x4: HADDPS(Vo,Wo);
                         0x5: WarnUnimpl::hsubps_Vo_Wo();
                         default: UD2();
                     }
diff -r 2e67bb7c9b4c src/arch/x86/isa/insts/simd128/floating_point/arithmetic/horizontal_addition.py
--- a/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/horizontal_addition.py	Mon Nov 09 10:02:55 2009 -0500
+++ b/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/horizontal_addition.py	Mon Nov 09 20:55:17 2009 -0500
@@ -54,7 +54,90 @@
 # Authors: Gabe Black
 
 microcode = '''
-# HADDPS
+def macroop HADDPS_XMM_XMM {
+    # Destination pair sums are gathered in ufp3. NOTE(review): assumes maddf size=4, ext=1 writes only bits 31:0 -- confirm.
+    movfp ufp2, xmmh
+    movfp ufp1, xmmh
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    maddf ufp3, ufp1, ufp2, size=4, ext=1
+    mslli ufp3, ufp3, 32, size=8, ext=0
+    # The xmmh pair sum now sits in ufp3[63:32]; the xmml pair sum is added into the low half.
+    movfp ufp2, xmml
+    movfp ufp1, xmml
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    maddf ufp3, ufp1, ufp2, size=4, ext=1
+    # Source operand (xmmhm/xmmlm) pair sums are gathered in ufp4 with the same sequence.
+    movfp ufp2, xmmhm
+    movfp ufp1, xmmhm
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    maddf ufp4, ufp1, ufp2, size=4, ext=1
+    mslli ufp4, ufp4, 32, size=8, ext=0
+    # Low source pair; after this every read of xmml/xmmh/xmmlm/xmmhm is done, so the writes below are safe.
+    movfp ufp2, xmmlm
+    movfp ufp1, xmmlm
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    maddf ufp4, ufp1, ufp2, size=4, ext=1
+    movfp xmmh, ufp4
+    movfp xmml, ufp3
+};
+
+def macroop HADDPS_XMM_M {
+    # Load and reduce the memory operand first: xmml/xmmh are not written until both loads are done, so a load fault leaves them intact.
+    ldfp ufp1, seg, sib, "DISPLACEMENT+8", dataSize=8
+    movfp ufp2, ufp1
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    maddf ufp4, ufp1, ufp2, size=4, ext=1
+    mslli ufp4, ufp4, 32, size=8, ext=0
+    # Low quadword of the memory operand; its pair sum goes under the shifted high-quadword sum in ufp4.
+    ldfp ufp1, seg, sib, disp, dataSize=8
+    movfp ufp2, ufp1
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    maddf ufp4, ufp1, ufp2, size=4, ext=1
+    # Destination pair sums are gathered in ufp3. NOTE(review): assumes maddf size=4, ext=1 writes only bits 31:0 -- confirm.
+    movfp ufp1, xmmh
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    movfp ufp2, xmmh
+    maddf ufp3, ufp1, ufp2, size=4, ext=1
+    mslli ufp3, ufp3, 32, size=8, ext=0
+    # Low destination pair.
+    movfp ufp1, xmml
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    movfp ufp2, xmml
+    maddf ufp3, ufp1, ufp2, size=4, ext=1
+    # Write both halves last, as HADDPS_XMM_XMM does.
+    movfp xmml, ufp3
+    movfp xmmh, ufp4
+};
+
+def macroop HADDPS_XMM_P {
+    # Load and reduce the memory operand first: xmml/xmmh are not written until both
+    # rip-relative loads are done, so a load fault leaves the destination register intact.
+    rdip t7
+    ldfp ufp1, seg, riprel, "DISPLACEMENT+8", dataSize=8
+    movfp ufp2, ufp1
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    maddf ufp4, ufp1, ufp2, size=4, ext=1
+    mslli ufp4, ufp4, 32, size=8, ext=0
+    # Low quadword of the memory operand; its pair sum goes under the shifted high-quadword sum in ufp4.
+    ldfp ufp1, seg, riprel, disp, dataSize=8
+    movfp ufp2, ufp1
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    maddf ufp4, ufp1, ufp2, size=4, ext=1
+    # Destination pair sums are gathered in ufp3. NOTE(review): assumes maddf size=4, ext=1 writes only bits 31:0 -- confirm.
+    movfp ufp1, xmmh
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    movfp ufp2, xmmh
+    maddf ufp3, ufp1, ufp2, size=4, ext=1
+    mslli ufp3, ufp3, 32, size=8, ext=0
+    # Low destination pair.
+    movfp ufp1, xmml
+    msrli ufp1, ufp1, 32, size=8, ext=0
+    movfp ufp2, xmml
+    maddf ufp3, ufp1, ufp2, size=4, ext=1
+    # Write both halves last, as HADDPS_XMM_XMM does.
+    movfp xmml, ufp3
+    movfp xmmh, ufp4
+};
 
 def macroop HADDPD_XMM_XMM {
     maddf ufp1, xmmh , xmml, size=8, ext=1
