@@ -79,18 +79,18 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
79
79
; CHECK-NEXT: movxm r6, #15616; vmul.f bmh2, x0, x3, r1
80
80
; CHECK-NEXT: movxm r7, #16000
81
81
; CHECK-NEXT: vbcst.16 x1, r3
82
- ; CHECK-NEXT: vbcst.16 x10 , r4
83
- ; CHECK-NEXT: vbcst.16 x8 , r5; vmul.f bmh3, x0, x3, r1
82
+ ; CHECK-NEXT: vbcst.16 x8 , r4
83
+ ; CHECK-NEXT: vbcst.16 x10 , r5; vmul.f bmh3, x0, x3, r1
84
84
; CHECK-NEXT: vbcst.16 x6, r6
85
85
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7
86
86
; CHECK-NEXT: vmov wh6, wl2
87
87
; CHECK-NEXT: vmin_ge.bf16 x3, r16, x3, x1
88
- ; CHECK-NEXT: or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x10
89
- ; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x8 , x3
88
+ ; CHECK-NEXT: or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x8
89
+ ; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x10 , x3
90
90
; CHECK-NEXT: vmov wh7, wl2
91
91
; CHECK-NEXT: vmin_ge.bf16 x5, r16, x5, x1
92
- ; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x10
93
- ; CHECK-NEXT: vband x7, x8 , x5
92
+ ; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x8
93
+ ; CHECK-NEXT: vband x7, x10 , x5
94
94
; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
95
95
; CHECK-NEXT: vmov wh4, wl2
96
96
; CHECK-NEXT: vmov wh3, wl2; vmul.f bmh4, x6, x7, r1
@@ -105,16 +105,16 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
105
105
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh5; movxm le, #.L_LEnd0
106
106
; CHECK-NEXT: add.nc lc, r2, #-2
107
107
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh7; vmin_ge.bf16 x3, r16, x3, x1
108
- ; CHECK-NEXT: vmax_lt.bf16 x3, r16, x3, x10
108
+ ; CHECK-NEXT: vmax_lt.bf16 x3, r16, x3, x8
109
109
; CHECK-NEXT: mova r0, #28; vconv.bf16.fp32 wl7, bmh3; vmin_ge.bf16 x11, r16, x5, x1
110
110
; CHECK-NEXT: .p2align 4
111
111
; CHECK-NEXT: .LBB0_1: // %for.body
112
112
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
113
- ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vband x9, x8 , x3; nopv
113
+ ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vband x9, x10 , x3; nopv
114
114
; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh3, wl2
115
115
; CHECK-NEXT: nopx ; vmov wh9, wl2; vmul.f bmh5, x7, x0, r1
116
- ; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x10 ; vmac.f bmh4, bmh0, x3, x4, r1
117
- ; CHECK-NEXT: vband x9, x8 , x5; vmul.f bmh2, x6, x9, r1
116
+ ; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x8 ; vmac.f bmh4, bmh0, x3, x4, r1
117
+ ; CHECK-NEXT: vband x9, x10 , x5; vmul.f bmh2, x6, x9, r1
118
118
; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh6, x7, x0, r1
119
119
; CHECK-NEXT: vsub.f bml0, bmh5, bmh1, r0
120
120
; CHECK-NEXT: vmul.f bmh3, x6, x9, r1
@@ -127,29 +127,29 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
127
127
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh7
128
128
; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmsc.f bml4, bml2, x3, x5, r1
129
129
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x9, r16, x3, x1
130
- ; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmax_lt.bf16 x3, r16, x9, x10
130
+ ; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmax_lt.bf16 x3, r16, x9, x8
131
131
; CHECK-NEXT: .L_LEnd0:
132
132
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x11, r16, x5, x1; nopv
133
133
; CHECK-NEXT: // %bb.2:
134
- ; CHECK-NEXT: nopa ; nopb ; nopxm
134
+ ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
135
135
; CHECK-NEXT: vmov wh7, wl2
136
136
; CHECK-NEXT: vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2
137
- ; CHECK-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
138
- ; CHECK-NEXT: vmax_lt.bf16 x10 , r16, x11, x10 ; vmul.f bmh2, x1, x0, r1
139
- ; CHECK-NEXT: vband x1, x8 , x3
140
- ; CHECK-NEXT: vband x8, x8, x10
137
+ ; CHECK-NEXT: vmul.f bmh3, x7, x0, r1
138
+ ; CHECK-NEXT: vmax_lt.bf16 x8 , r16, x11, x8 ; vmul.f bmh2, x1, x0, r1
139
+ ; CHECK-NEXT: vband x1, x10 , x3
140
+ ; CHECK-NEXT: vband x10, x10, x8
141
141
; CHECK-NEXT: vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0
142
- ; CHECK-NEXT: vmov wh8 , wl2; vsub.f bmh2, bmh2, bmh1, r0
142
+ ; CHECK-NEXT: vmov wh10 , wl2; vsub.f bmh2, bmh2, bmh1, r0
143
143
; CHECK-NEXT: vmul.f bmh2, x6, x1, r1
144
- ; CHECK-NEXT: vmov wh4, wl2; vmul.f bmh3, x6, x8 , r1
144
+ ; CHECK-NEXT: vmul.f bmh3, x6, x10 , r1
145
145
; CHECK-NEXT: vmov wh3, wl2
146
- ; CHECK-NEXT: vmov wh10 , wl2
146
+ ; CHECK-NEXT: vmov wh8 , wl2
147
147
; CHECK-NEXT: vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1
148
- ; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10 , x4, r1
149
- ; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh2
148
+ ; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x8 , x4, r1
149
+ ; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh2; vmov wh4, wl2
150
150
; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh3
151
151
; CHECK-NEXT: vmsc.f bmh2, bmh4, x4, x3, r1
152
- ; CHECK-NEXT: vmsc.f bmh0, bmh0, x4, x10 , r1
152
+ ; CHECK-NEXT: vmsc.f bmh0, bmh0, x4, x8 , r1
153
153
; CHECK-NEXT: nop
154
154
; CHECK-NEXT: nop
155
155
; CHECK-NEXT: nop
0 commit comments