引き続きSIMDの練習。

モノクロやネガのようにリニアに読み書きすればいい処理と違って、y方向近傍を読まざるを得ない座標変換が入ると、キャッシュミスによる遅延が効いてきそうですが、避けて通るわけにも行かないので挑戦してみます。
まずは画像のスケール変換を書いてみました。

static wxImage ResizeImage(wxImage &src_image, double size) {
 int src_real_w = src_image.GetWidth();
 int src_real_h = src_image.GetHeight();
 int dst_real_w = src_real_w*size;
 int dst_real_h = src_real_h*size;
 // 実際のビットマップの大きさ
 int dst_w = std::min<int>(src_real_w, src_real_w*size);
 int dst_h = std::min<int>(src_real_h, src_real_h*size);
 int src_w = dst_w*src_real_w/dst_real_w;
 int src_h = dst_h*src_real_h/dst_real_h;
 
 if (dst_w*dst_h<=4)
  return wxImage();

 unsigned char* src = src_image.GetData();
 unsigned char* src4 = (unsigned char*)malloc(src_w*src_h*3*4);

 int aligned_dst_size = dst_w*dst_h*3;
 aligned_dst_size += 3-(aligned_dst_size+3)%4;
 unsigned char* dst = (unsigned char*)malloc(aligned_dst_size);
 unsigned char* dst4 = (unsigned char*)malloc(dst_w*dst_h*3*4);
 // byte => DWORD
 for (int v=0; v<src_h; v++)
  for (int u=0; u<src_w; u++) {
   int p_s = (v*src_real_w + u)*3;
   int p_s4 = (v*src_w + u)*3*4;
   src4[p_s4 + 0] = src[p_s + 0];
   src4[p_s4 + 4] = src[p_s + 1];
   src4[p_s4 + 8] = src[p_s + 2];
  }
 switch (0) {
  case 0: {
   // bilinear
   // 参照先srcを越えない範囲
   int v_to = std::min<int>(dst_h-1, (src_h-1)*size);
   __m128 xmm_dummy;
   float dummy = 0;
   /* 水平用 */
   xmm_dummy = _mm_load1_ps(&dummy);
   // RGB一括ロードのマスク
   __m128i xmm_mask;
   xmm_mask.m128i_i32[0] = 0x000000ff;
   xmm_mask = _mm_shuffle_epi32(xmm_mask, 0);
   xmm_mask.m128i_i32[3] = 0;
   
   char *buf = (char*)_mm_malloc(16, sizeof(__m128i));
   for (int v=1; v<v_to; v++) {
     long l = v/size;
     long l1 = l+1;
     double wv = v/size - l;
     double wv1 = 1 - wv;
     // プリフェッチ
     _mm_prefetch((const char*)(src4 + l*src_w*3*4), _MM_HINT_T2);
     _mm_prefetch((const char*)(src4 + l1*src_w*3*4), _MM_HINT_T2);

     // 参照先srcを越えない範囲
     int u_to = std::min<int>(dst_w-1, (src_w-1)*size);
     for (int u=1; u<u_to; u++) {
      long k = u/size;
      long k1 = k+1;

      double wu = u/size - k;
      double wu1 = 1 - wu;
      long p_s_0_0 = (l*src_w + k)*3*4; /* (k, l) */
      long p_s_0_1 = (l*src_w + k1)*3*4; /* (k+1, l) */
      long p_s_1_0 = (l1*src_w + k)*3*4; /* (k, l+1) */
      long p_s_1_1 = (l1*src_w + k1)*3*4; /* (k+1, l+1) */
      long p_d = (v*dst_w+u)*3*4;
      /*
      dst4[p_d + 0] = (src4[p_s_0_0 + 0]*wu1 + src4[p_s_0_1 + 0]*wu)*wv1
       + (src4[p_s_1_0 + 0]*wu1 + src4[p_s_1_1 + 0]*wu)*wv;

      dst4[p_d + 4] = (src4[p_s_0_0 + 4]*wu1 + src4[p_s_0_1 + 4]*wu)*wv1
       + (src4[p_s_1_0 + 4]*wu1 + src4[p_s_1_1 + 4]*wu)*wv;

      dst4[p_d + 8] = (src4[p_s_0_0 + 8]*wu1 + src4[p_s_0_1 + 8]*wu)*wv1
       + (src4[p_s_1_0 + 8]*wu1 + src4[p_s_1_1 + 8]*wu)*wv;
      */
      __m128 xmm_dst, xmm_wu, xmm_wv;

      // DWORD転送 xmm_src_all_d: 0 [b] [g] [r]
      __m128i xmm_all_tmp;
      __m128 xmm_src_all_0, xmm_src_all_1, xmm_src_all_2, xmm_src_all_3;
      xmm_all_tmp = _mm_loadu_si128((__m128i *)(src4 + p_s_0_0));
      xmm_all_tmp = _mm_and_si128(xmm_mask, xmm_all_tmp);
      xmm_src_all_0 = _mm_cvtepi32_ps(xmm_all_tmp);
      xmm_all_tmp = _mm_loadu_si128((__m128i *)(src4 + p_s_0_1));
      xmm_all_tmp = _mm_and_si128(xmm_mask, xmm_all_tmp);
      xmm_src_all_1 = _mm_cvtepi32_ps(xmm_all_tmp);
      xmm_all_tmp = _mm_loadu_si128((__m128i *)(src4 + p_s_1_0));
      xmm_all_tmp = _mm_and_si128(xmm_mask, xmm_all_tmp);
      xmm_src_all_2 = _mm_cvtepi32_ps(xmm_all_tmp);
      xmm_all_tmp = _mm_loadu_si128((__m128i *)(src4 + p_s_1_1));
      xmm_all_tmp = _mm_and_si128(xmm_mask, xmm_all_tmp);
      xmm_src_all_3 = _mm_cvtepi32_ps(xmm_all_tmp);
      
      // 色ごとに集める xmm_rtmp: r3 r2 r1 r0
      __m128 xmm_colors_tmp[3]; // r, g, b
      xmm_colors_tmp[0] = _mm_shuffle_ps(xmm_src_all_0, xmm_src_all_0, SHUFFLEBITS(3, 3, 3, 0)); // 0 0 0 r0
      xmm_colors_tmp[0] = _mm_or_ps(xmm_colors_tmp[0], _mm_shuffle_ps(xmm_src_all_1, xmm_src_all_1, SHUFFLEBITS(3, 3, 0, 3))); // 0 0 r1 0
      xmm_colors_tmp[0] = _mm_or_ps(xmm_colors_tmp[0], _mm_shuffle_ps(xmm_src_all_2, xmm_src_all_2, SHUFFLEBITS(3, 0, 3, 3))); // 0 r2 0 0
      xmm_colors_tmp[0] = _mm_or_ps(xmm_colors_tmp[0], _mm_shuffle_ps(xmm_src_all_3, xmm_src_all_3, SHUFFLEBITS(0, 3, 3, 3))); // r3 0 0 0
      xmm_colors_tmp[1] = _mm_shuffle_ps(xmm_src_all_0, xmm_src_all_0, SHUFFLEBITS(3, 3, 3, 1));
      xmm_colors_tmp[1] = _mm_or_ps(xmm_colors_tmp[1], _mm_shuffle_ps(xmm_src_all_1, xmm_src_all_1, SHUFFLEBITS(3, 3, 1, 3)));
      xmm_colors_tmp[1] = _mm_or_ps(xmm_colors_tmp[1], _mm_shuffle_ps(xmm_src_all_2, xmm_src_all_2, SHUFFLEBITS(3, 1, 3, 3)));
      xmm_colors_tmp[1] = _mm_or_ps(xmm_colors_tmp[1], _mm_shuffle_ps(xmm_src_all_3, xmm_src_all_3, SHUFFLEBITS(1, 3, 3, 3)));
      xmm_colors_tmp[2] = _mm_shuffle_ps(xmm_src_all_0, xmm_src_all_0, SHUFFLEBITS(3, 3, 3, 2));
      xmm_colors_tmp[2] = _mm_or_ps(xmm_colors_tmp[2], _mm_shuffle_ps(xmm_src_all_1, xmm_src_all_1, SHUFFLEBITS(3, 3, 2, 3)));
      xmm_colors_tmp[2] = _mm_or_ps(xmm_colors_tmp[2], _mm_shuffle_ps(xmm_src_all_2, xmm_src_all_2, SHUFFLEBITS(3, 2, 3, 3)));
      xmm_colors_tmp[2] = _mm_or_ps(xmm_colors_tmp[2], _mm_shuffle_ps(xmm_src_all_3, xmm_src_all_3, SHUFFLEBITS(2, 3, 3, 3)));

      /* wu1 wu wu1 wu */
      xmm_wu.m128_f32[0] = wu1;
      xmm_wu.m128_f32[1] = wu;
      xmm_wu = _mm_shuffle_ps(xmm_wu, xmm_wu, SHUFFLEBITS(1, 0, 1, 0));
      /* wv1 wv1 wv wv */
      xmm_wv.m128_f32[0] = wv1;
      xmm_wv.m128_f32[2] = wv;
      xmm_wv = _mm_shuffle_ps(xmm_wv, xmm_wv, SHUFFLEBITS(2, 2, 0, 0));

#define BILINEAR_LINE_N(COLORINDEX) { \
      /* 積 */ \
      xmm_dst = _mm_mul_ps(xmm_colors_tmp[COLORINDEX], xmm_wu); \
      xmm_dst = _mm_mul_ps(xmm_dst, xmm_wv); \
      /* 水平 */  \
      xmm_dst = _mm_hadd_ps(xmm_dst, xmm_dummy); \
      xmm_dst = _mm_hadd_ps(xmm_dst, xmm_dummy); \
      /* 結果を非テンポラルに書き込む */ \
      xmm_all_tmp = _mm_cvtps_epi32(xmm_dst); \
      _mm_stream_si128((__m128i*)(buf), xmm_all_tmp); \
      dst4[p_d + COLORINDEX*4] = buf[0]; \
      }
      BILINEAR_LINE_N(0);
      BILINEAR_LINE_N(1);
      BILINEAR_LINE_N(2);
#undef BILINEAR_LINE_N
     }
   }
   _mm_free(buf);
  } break;
 }

 // DWORD => byte
 for (int v=0; v<dst_h; v++)
  for (int u=0; u<dst_w; u++) {
   int p_d = (v*dst_w + u)*3;
   int p_d4 = p_d*4;
   dst[p_d + 0] = dst4[p_d4 + 0];
   dst[p_d + 1] = dst4[p_d4 + 4];
   dst[p_d + 2] = dst4[p_d4 + 8];
  }
 
 free((void*)src4);
 free((void*)dst4);

 wxImage result = wxImage(dst_w, dst_h);
 result.SetData(dst);
 return result;
}

とりあえず動くことは動く。絶望的に遅いので特に評価はしていませんが…
プリフェッチ命令とか非テンポラルストアとか、意味があるのか分からないものの使ってみました。
命令とコストの対応が分かってないので、何をやるにしても方針が立ちません。やはり一度真面目に勉強するべきか。

追記: float(ps)は遅そうなので、ふつうのCでSSE使わずに、256倍して整数で書いてみました。結果は2倍以上高速に…
やはり命令の使い方の前にコストを知る必要がありそうです。