92  __m128i m0 = 
loadu(&
block[
sizeof(__m128i) * 0]);
 
   93  __m128i m1 = 
loadu(&
block[
sizeof(__m128i) * 1]);
 
   94  __m128i m2 = 
loadu(&
block[
sizeof(__m128i) * 2]);
 
   95  __m128i m3 = 
loadu(&
block[
sizeof(__m128i) * 3]);
 
   97  __m128i t0, t1, t2, t3, tt;
 
  102  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 
  104  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 
  107  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   
 
  108  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 
  110  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   
 
  111  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 
  121  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 
  122  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 
  124  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 
  125  t1 = _mm_blend_epi16(tt, t1, 0xCC);
 
  126  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 
  128  t2 = _mm_unpacklo_epi64(m3, m1);
 
  129  tt = _mm_blend_epi16(t2, m2, 0xC0);
 
  130  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 
  131  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 
  132  t3 = _mm_unpackhi_epi32(m1, m3);
 
  133  tt = _mm_unpacklo_epi32(m2, t3);
 
  134  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 
  135  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 
  144  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 
  145  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 
  147  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 
  148  t1 = _mm_blend_epi16(tt, t1, 0xCC);
 
  149  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 
  151  t2 = _mm_unpacklo_epi64(m3, m1);
 
  152  tt = _mm_blend_epi16(t2, m2, 0xC0);
 
  153  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 
  154  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 
  155  t3 = _mm_unpackhi_epi32(m1, m3);
 
  156  tt = _mm_unpacklo_epi32(m2, t3);
 
  157  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 
  158  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 
  167  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 
  168  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 
  170  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 
  171  t1 = _mm_blend_epi16(tt, t1, 0xCC);
 
  172  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 
  174  t2 = _mm_unpacklo_epi64(m3, m1);
 
  175  tt = _mm_blend_epi16(t2, m2, 0xC0);
 
  176  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 
  177  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 
  178  t3 = _mm_unpackhi_epi32(m1, m3);
 
  179  tt = _mm_unpacklo_epi32(m2, t3);
 
  180  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 
  181  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 
  190  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 
  191  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 
  193  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 
  194  t1 = _mm_blend_epi16(tt, t1, 0xCC);
 
  195  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 
  197  t2 = _mm_unpacklo_epi64(m3, m1);
 
  198  tt = _mm_blend_epi16(t2, m2, 0xC0);
 
  199  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 
  200  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 
  201  t3 = _mm_unpackhi_epi32(m1, m3);
 
  202  tt = _mm_unpacklo_epi32(m2, t3);
 
  203  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 
  204  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 
  213  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 
  214  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 
  216  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 
  217  t1 = _mm_blend_epi16(tt, t1, 0xCC);
 
  218  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 
  220  t2 = _mm_unpacklo_epi64(m3, m1);
 
  221  tt = _mm_blend_epi16(t2, m2, 0xC0);
 
  222  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 
  223  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 
  224  t3 = _mm_unpackhi_epi32(m1, m3);
 
  225  tt = _mm_unpacklo_epi32(m2, t3);
 
  226  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 
  227  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 
  236  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 
  237  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 
  239  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 
  240  t1 = _mm_blend_epi16(tt, t1, 0xCC);
 
  241  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 
  243  t2 = _mm_unpacklo_epi64(m3, m1);
 
  244  tt = _mm_blend_epi16(t2, m2, 0xC0);
 
  245  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 
  246  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 
  247  t3 = _mm_unpackhi_epi32(m1, m3);
 
  248  tt = _mm_unpacklo_epi32(m2, t3);
 
  249  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 
  250  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);