AVX と SIMD 演算と最大値の計算

AVX 命令を用いる SIMD 演算で最大値を求める

　今回は AVX 命令を用いる SIMD 演算で最大値を求めてみます．考え方は前回の総和計算（2012-03-11 - kawa0810の日記）に近いと思われます．

　最大値を求める AVX 命令は

単精度 (float) : _mm256_max_ps(__m256 , __m256);
倍精度 (double) : _mm256_max_pd(__m256d , __m256d);

です．また，if 文を羅列するのは ~~ダルイ~~ 見栄えがよくないので三項演算子を使って条件分岐の処理をしています．

最大値を求めるソースコード

　以下に AVX 命令を用いた SIMD 演算で最大値を求めるソースコードを記載します．ただし，以下のソースコードにはバグが含まれているので実行する際は気をつけてください（ $n$ が8以上ならば問題ないはずです）．また，答えの確認には max_element() を使用しています．

/*--------------------
  max_simd.cpp
  --------------------*/

#include <iostream>
#include <cstdlib>
#include <ctime>
#include <numeric>
#include <immintrin.h>
#include <algorithm>

float vec_max(const std::size_t n, float const* x){
  static const std::size_t para = 8;
  const std::size_t end = n / para;
  const std::size_t beg = end * para;

  auto vx = reinterpret_cast<__m256 const*>( x );
  auto vmax = vx[0];
  auto max = reinterpret_cast<float*>( &vmax );

  for(std::size_t i=1; i<end; ++i)
    vmax = _mm256_max_ps(vmax, vx[i]);
  for(std::size_t i=beg; i<n; ++i)
    max[0] = (max[0] < x[i]) ? x[i] : max[0];
    
  max[0] = (max[0] < max[1]) ? max[1] : max[0];
  max[0] = (max[0] < max[2]) ? max[2] : max[0];
  max[0] = (max[0] < max[3]) ? max[3] : max[0];
  max[0] = (max[0] < max[4]) ? max[4] : max[0];
  max[0] = (max[0] < max[5]) ? max[5] : max[0];
  max[0] = (max[0] < max[6]) ? max[6] : max[0];
  max[0] = (max[0] < max[7]) ? max[7] : max[0];

  return max[0];
}

double vec_max(const std::size_t n, double const* x){
  static const std::size_t para = 4;
  const std::size_t end = n / para;
  const std::size_t beg = end * para;

  auto vx = reinterpret_cast<__m256d const*>( x );
  auto vmax = vx[0];
  auto max = reinterpret_cast<double*>( &vmax );

  for(std::size_t i=1; i<end; ++i)
    vmax = _mm256_max_pd(vmax, vx[i]);
  for(std::size_t i=beg; i<n; ++i)
    max[0] = (max[0] < x[i]) ? x[i] : max[0];
    
  max[0] = (max[0] < max[1]) ? max[1] : max[0];
  max[0] = (max[0] < max[2]) ? max[2] : max[0];
  max[0] = (max[0] < max[3]) ? max[3] : max[0];

  return max[0];
}

int main(void){
  const std::size_t n = 10;
  float* x = static_cast<float*>( _mm_malloc(sizeof(float) * n, 32) );
  double* y = static_cast<double*>( _mm_malloc(sizeof(double) * n, 32) );

  srand( static_cast<unsigned>(time(NULL)) );
  for(std::size_t i=0; i<n; ++i) x[i] = static_cast<float>( rand() ) / RAND_MAX;
  for(std::size_t i=0; i<n; ++i) y[i] = static_cast<double>( rand() ) / RAND_MAX;

  auto max_s = std::max_element( x, x+n );
  auto max_d = std::max_element( y, y+n );

  std::cout << "Single Precision\n";
  std::cout << "-max_element: " << *max_s << std::endl;  
  std::cout << "-SIMD: " << vec_max(n, x) << std::endl;

  std::cout << "\nDouble Precision\n";
  std::cout << "-max_element: " << *max_d << std::endl;  
  std::cout << "-SIMD: " << vec_max(n, y) << std::endl;

  _mm_free(x);
  _mm_free(y);

  return 0;
}

上記ソースコードの問題点

　前述のソースコードにはバグを含んでいます．計算する要素数が大規模ならばおそらく問題ありませんが，小さい場合に環境によっては不具合が生じる可能性があります．具体的には，

単精度ならば要素数が7個以下
倍精度ならば要素数3個以下

の場合です．理由と対処方法は次回やります．

その1: 2012-03-15 - kawa0810の日記
その2: 2012-03-18 - kawa0810の日記