92 __m128i m0 =
loadu(&
block[
sizeof(__m128i) * 0]);
93 __m128i m1 =
loadu(&
block[
sizeof(__m128i) * 1]);
94 __m128i m2 =
loadu(&
block[
sizeof(__m128i) * 2]);
95 __m128i m3 =
loadu(&
block[
sizeof(__m128i) * 3]);
97 __m128i t0, t1, t2, t3, tt;
102 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
104 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
107 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
108 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
110 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
111 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
121 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
122 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
124 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
125 t1 = _mm_blend_epi16(tt, t1, 0xCC);
126 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
128 t2 = _mm_unpacklo_epi64(m3, m1);
129 tt = _mm_blend_epi16(t2, m2, 0xC0);
130 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
131 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
132 t3 = _mm_unpackhi_epi32(m1, m3);
133 tt = _mm_unpacklo_epi32(m2, t3);
134 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
135 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
144 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
145 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
147 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
148 t1 = _mm_blend_epi16(tt, t1, 0xCC);
149 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
151 t2 = _mm_unpacklo_epi64(m3, m1);
152 tt = _mm_blend_epi16(t2, m2, 0xC0);
153 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
154 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
155 t3 = _mm_unpackhi_epi32(m1, m3);
156 tt = _mm_unpacklo_epi32(m2, t3);
157 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
158 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
167 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
168 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
170 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
171 t1 = _mm_blend_epi16(tt, t1, 0xCC);
172 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
174 t2 = _mm_unpacklo_epi64(m3, m1);
175 tt = _mm_blend_epi16(t2, m2, 0xC0);
176 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
177 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
178 t3 = _mm_unpackhi_epi32(m1, m3);
179 tt = _mm_unpacklo_epi32(m2, t3);
180 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
181 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
190 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
191 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
193 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
194 t1 = _mm_blend_epi16(tt, t1, 0xCC);
195 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
197 t2 = _mm_unpacklo_epi64(m3, m1);
198 tt = _mm_blend_epi16(t2, m2, 0xC0);
199 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
200 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
201 t3 = _mm_unpackhi_epi32(m1, m3);
202 tt = _mm_unpacklo_epi32(m2, t3);
203 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
204 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
213 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
214 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
216 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
217 t1 = _mm_blend_epi16(tt, t1, 0xCC);
218 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
220 t2 = _mm_unpacklo_epi64(m3, m1);
221 tt = _mm_blend_epi16(t2, m2, 0xC0);
222 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
223 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
224 t3 = _mm_unpackhi_epi32(m1, m3);
225 tt = _mm_unpacklo_epi32(m2, t3);
226 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
227 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
236 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
237 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
239 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
240 t1 = _mm_blend_epi16(tt, t1, 0xCC);
241 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
243 t2 = _mm_unpacklo_epi64(m3, m1);
244 tt = _mm_blend_epi16(t2, m2, 0xC0);
245 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
246 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
247 t3 = _mm_unpackhi_epi32(m1, m3);
248 tt = _mm_unpacklo_epi32(m2, t3);
249 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
250 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);