98 __m128i m0 =
loadu(&
block[
sizeof(__m128i) * 0]);
99 __m128i m1 =
loadu(&
block[
sizeof(__m128i) * 1]);
100 __m128i m2 =
loadu(&
block[
sizeof(__m128i) * 2]);
101 __m128i m3 =
loadu(&
block[
sizeof(__m128i) * 3]);
103 __m128i t0, t1, t2, t3, tt;
108 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
110 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
113 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
114 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
116 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
117 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
127 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
128 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
130 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
132 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
134 t2 = _mm_unpacklo_epi64(m3, m1);
136 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
137 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
138 t3 = _mm_unpackhi_epi32(m1, m3);
139 tt = _mm_unpacklo_epi32(m2, t3);
140 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
141 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
150 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
151 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
153 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
155 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
157 t2 = _mm_unpacklo_epi64(m3, m1);
159 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
160 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
161 t3 = _mm_unpackhi_epi32(m1, m3);
162 tt = _mm_unpacklo_epi32(m2, t3);
163 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
164 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
173 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
174 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
176 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
178 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
180 t2 = _mm_unpacklo_epi64(m3, m1);
182 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
183 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
184 t3 = _mm_unpackhi_epi32(m1, m3);
185 tt = _mm_unpacklo_epi32(m2, t3);
186 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
187 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
196 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
197 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
199 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
201 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
203 t2 = _mm_unpacklo_epi64(m3, m1);
205 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
206 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
207 t3 = _mm_unpackhi_epi32(m1, m3);
208 tt = _mm_unpacklo_epi32(m2, t3);
209 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
210 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
219 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
220 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
222 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
224 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
226 t2 = _mm_unpacklo_epi64(m3, m1);
228 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
229 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
230 t3 = _mm_unpackhi_epi32(m1, m3);
231 tt = _mm_unpacklo_epi32(m2, t3);
232 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
233 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
242 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
243 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
245 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
247 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
249 t2 = _mm_unpacklo_epi64(m3, m1);
251 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
252 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
253 t3 = _mm_unpackhi_epi32(m1, m3);
254 tt = _mm_unpacklo_epi32(m2, t3);
255 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
256 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);