「最適化・並列化コードを生み出す最新コンパイラ「インテル コンパイラー」サンプルコード

このページでは『最適化・並列化コードを生み出す最新コンパイラ「インテル コンパイラー」』記事内で使用したソースコードをまとめています。

リスト1 「コンパイラを変えるだけでパフォーマンス向上、インテル コンパイラの実力を見る」記事内で使用した原始的な行列演算プログラム

/*
 * Matrix calculation test program
 * matrix1.c
 */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

#define N 400
#define REPEAT 100
#define AT(matrix,i,j) (matrix[i*dim+j])

static unsigned long next_rand = 1;
void print_matrix( double* matrix, int dim );
double my_rand(void);
void prod_matrix( double* matrixA, double* matrixB, double* matrixC, int dim );
void cp_matrix( double* matrixA, double* matrixB, int dim );
void mult_matrix( double* matrixA, double* matrixB, double x, int dim );

int main( int argc, const char **argv) {
  int i, j;
  int rep;
  int dim = N;
  double* matrixA;
  double* matrixB;
  double* matrixC;
  clock_t clock_start, clock_end;

  clock_start = clock();

  /* allocate matrix */
  matrixA = (double*)malloc( sizeof(double) * dim * dim );
  matrixB = (double*)malloc( sizeof(double) * dim * dim );
  matrixC = (double*)malloc( sizeof(double) * dim * dim );

  if( (matrixA == NULL) ||
      (matrixB == NULL) ||
      (matrixC == NULL) ) {
    printf( "cannot allocate memory. exit.\n" );
    free(matrixA);
    free(matrixB);
    free(matrixC);
    return -1;
  }

  /* initialize matrix */
  for( i = 0; i < dim; i++ ) {
    for( j = 0; j < dim; j++ ) {
      AT(matrixA,i,j) = my_rand();
      AT(matrixB,i,j) = my_rand();
    }
  }

  for( rep = 0; rep < REPEAT; rep++ ) {
    prod_matrix( matrixC, matrixA, matrixB, dim );
    mult_matrix( matrixA, matrixC, 2.0/(double)dim, dim );
  }


  /*  print_matrix(matrixC, dim);*/
  free(matrixA);
  free(matrixB);
  free(matrixC);

  clock_end = clock();

  printf( "start: %d end: %d elapse: %d.\n", clock_start, clock_end, clock_end - clock_start );
  printf( "elapse time: %lf sec.", ((double)(clock_end - clock_start)) / (double)CLOCKS_PER_SEC );

  return 0;
}


/* A = B; copy matrix B to matrix A */
void cp_matrix( double* matrixA, double* matrixB,int dim ) {
  memcpy( matrixA, matrixB, sizeof(double)*dim*dim );
}

/* A = B * C; matrixA = matrixB * matrixC */
void prod_matrix( double* matrixA, double* matrixB, double* matrixC, int dim ) {
  int i, j, m;

  memset( matrixA, 0, sizeof(double)*dim*dim );

  for( i = 0; i < dim; i++ ) {
    for( m = 0; m < dim; m++ ) {
      for( j = 0; j < dim; j++ ) {
        AT(matrixA,i,j) += AT(matrixB,i,m) * AT(matrixC, m, j);
      }
    }
  }
}

/* matrixA = matrixB * x */
void mult_matrix( double* matrixA, double* matrixB, double x, int dim ) {
  int i, j;

  for( i = 0; i < dim; i++ ) {
    for( j = 0; j < dim; j++ ) {
      AT(matrixA,i,j) = AT(matrixB,i,j) * x;
    }
  }
}

void print_matrix( double* matrix, int dim ) {
  int i,j;

  for( i = 0; i < dim; i++ ) {
    for( j = 0; j < dim; j++ ) {
      // output
      printf( "%lf ", AT(matrix,i,j) );
    }
    printf( "\n" );
  }

}

double my_rand(void) {
    next_rand = next_rand * 1103515245 + 12345;
    return (((double)((unsigned)(next_rand/65536) % 32768))/32768);
}

リスト2 『インテル謹製の数値演算ライブラリ「MKL」を使ってプログラムを高速化』記事内で使用したBLASを使った行列演算プログラム

/* mkl1.c */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <time.h>
#include <mkl_cblas.h>


#define ITEST_PARAM_N 100
#define ITEST_PARAM_REPEAT 100
#define AT(matrix,i,j) (matrix[i*dim+j])

static unsigned long next_rand = 1;
void print_matrix( double* matrix, int dim );
double my_rand(void);
void prod_matrix_with_blas( double* matrixA, double* matrixB, double* matrixC, int dim );

void cp_matrix( double* matrixA, double* matrixB,int dim );
void mult_matrix( double* matrixA, double* matrixB, double x, int dim );
void mult_matrix_with_blas( double* matrixA, double x, int dim );

int main( int argc, const char **argv) {
  int i, j;
  int rep;
  int dim = ITEST_PARAM_N;
  double* matrixA;
  double* matrixB;
  double* matrixC;
  clock_t clock_start, clock_end;

  clock_start = clock();

  /* allocate matrix */
  matrixA = (double*)malloc( sizeof(double) * dim * dim );
  matrixB = (double*)malloc( sizeof(double) * dim * dim );
  matrixC = (double*)malloc( sizeof(double) * dim * dim );

  if( (matrixA == NULL) ||
      (matrixB == NULL) ||
      (matrixC == NULL) ) {
    printf( "cannot allocate memory. exit.\n");
    free(matrixA);
    free(matrixB);
    free(matrixC);
    return -1;
  }

  /* initialize matrix */
  for( i = 0; i < dim; i++ ) {
    for( j = 0; j < dim; j++ ) {
      AT(matrixA,i,j) = my_rand();
      AT(matrixB,i,j) = my_rand();
    }
  }


  mult_matrix_with_blas( matrixB, 2.0/(double)dim, dim );

#ifdef TEST_SHOW_RESULT
  print_matrix(matrixB, dim);
#endif

  /* calculate C = A*(B/a)^REPEAT */
  for( rep = 0; rep < ITEST_PARAM_REPEAT; rep++ ) {
    prod_matrix_with_blas( matrixC, matrixA, matrixB, dim );
  }

#ifdef TEST_SHOW_RESULT
  print_matrix(matrixC, dim);
#endif

  free(matrixA);
  free(matrixB);
  free(matrixC);

  clock_end = clock();

  printf( "start: %d end: %d elapse: %d.\n", clock_start, clock_end, clock_end - clock_start );
  printf( "elapse time: %lf sec.", ((double)(clock_end - clock_start)) / (double)CLOCKS_PER_SEC );

  return 0;
}

/* A = B; copy matrix B to matrix A */
void cp_matrix( double* matrixA, double* matrixB,int dim ) {
  memcpy( matrixA, matrixB, sizeof(double)*dim*dim );
}

/* A = B * C; matrixA = matrixB * matrixC */
/*
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
                 const enum CBLAS_TRANSPOSE TransB,
                 const int M, const int N, const int K, const double alpha,
                 const double *A, const int lda,
                 const double *B, const int ldb,
                 const double beta, double *C, const int ldc);
A:(M,K)
B:(K,N)
C:(M,N)
C = ALPHA*A*B + BETA*C
LDA,LDB,LDC=DIM
 */

void prod_matrix_with_blas( double* matrixA, double* matrixB, double* matrixC, int dim ) {
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              dim, dim, dim, 1.0,
              matrixB, dim,
              matrixC, dim, 0.0, matrixA, dim);
}

/* matrixA = matrixB * x */
void mult_matrix( double* matrixA, double* matrixB, double x, int dim ) {
  int i, j;

  for( i = 0; i < dim; i++ ) {
    for( j = 0; j < dim; j++ ) {
      AT(matrixA,i,j) = AT(matrixB,i,j) * x;
    }
  }
}

/* matrixA = matrixA * x */
/*
  SUBROUTINE DSCAL(N,DA,DX,INCX)
  DX = DA * DX
  DA:scalar
  DX:(N)
  INCX:1
 */
void mult_matrix_with_blas( double* matrixA, double x, int dim ) {
  cblas_dscal( dim*dim, x, matrixA, 1 );
}

void print_matrix( double* matrix, int dim ) {
  int i,j;

  for( i = 0; i < dim; i++ ) {
    for( j = 0; j < dim; j++ ) {
      // output
      printf( "%lf ", AT(matrix,i,j) );
    }
    printf( "\n" );
  }

}

double my_rand(void) {
    next_rand = next_rand * 1103515245 + 12345;
    return (((double)((unsigned)(next_rand/65536) % 32768))/32768);
}

リスト3 『インテル謹製の数値演算ライブラリ「MKL」を使ってプログラムを高速化』記事内で使用したFFTWによるFFT処理プログラム

/* fft1.c */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <memory.h>
#include <math.h>
#include <time.h>

#include "fftw3.h"

/*
  fft1 <input file> <output_file>
 */

#define FMODE_WRITE "w"
#define NORM(x,y) (sqrt(x*x+y*y))
#define POWR(x,y) (x*x+y*y)


#define c_re(x) (x[0])
#define c_im(x) (x[1])

/* prototypes */
unsigned char* load_image( char* filename,
                           int* pWidth,
                           int* pHeight,
                           int* pDepth );
void write_image( char* filename,
                  unsigned char* buff,
                  int width,
                  int height,
                  int depth );


unsigned char* load_image( char* filename,
                           int* pWidth,
                           int* pHeight,
                           int* pDepth ) {
    FILE* fp_in;
    unsigned char* buff;
    char szBuf[512];
    int height = 0;
    int width = 0;
    int depth = 0;

    fp_in = fopen( filename, "r" );
    if( fp_in == NULL ) {
        fprintf( stderr, "cannot open file: %s\n", filename );
        return NULL;
    }

    /* read PGM Header */
    fgets( szBuf, 512, fp_in );
    if( strlen( szBuf ) > 3 ) {
        fprintf( stderr, "invalid image file: %s\n", filename );
        fclose( fp_in );
        return NULL;
    }
    if( strcmp( szBuf, "P5\n" ) != 0 ) {
        fprintf( stderr, "invalid image file: %s\n", filename );
        fclose( fp_in );
        return NULL;
    }

    /* read #comment */
    fgets( szBuf, 512, fp_in );
    while( szBuf[0] == '#' ) {
        while( (szBuf[strlen(szBuf)-1] != '\n') && (!feof(fp_in)) ) {
            fgets( szBuf, 512, fp_in );
        }
        fgets( szBuf, 512, fp_in );
    }

    if(feof(fp_in)) {
        fprintf( stderr, "invalid image file: %s\n", filename );
        fclose( fp_in );
        return NULL;
    }

    /* read image size */
    if (szBuf[strlen(szBuf)-1] != '\n') {
        fprintf( stderr, "image file is strange: %s\n", filename );
        fclose( fp_in );
        return NULL;
    }
    sscanf( szBuf, "%d %d", &width, &height );
    if( (width == 0) || (height == 0) ) {
        fprintf( stderr, "image file is strange: %s\n", filename );
        fclose( fp_in );
        return NULL;
    }
    fgets( szBuf, 512, fp_in );

    /* read depth */
    if (szBuf[strlen(szBuf)-1] != '\n') {
        fprintf( stderr, "image file is strange: %s\n", filename );
        fclose( fp_in );
        return NULL;
    }
    sscanf( szBuf, "%d", &depth );
    if( depth == 0 ) {
        fprintf( stderr, "image file is strange: %s\n", filename );
        fclose( fp_in );
        return NULL;
    }

    printf( "image size: %d x %d, depth: %d\n", width, height, depth );

    /* allocate buffer */
    buff = (unsigned char*)malloc( width * height * sizeof(unsigned char) );
    if( buff == NULL ) {
        fclose( fp_in );
        return NULL;
    }

    fread( buff, 1, width*height, fp_in );
    fclose( fp_in );

    *pWidth = width;
    *pHeight = height;
    *pDepth = depth;

    return buff;
}

void write_image( char* filename,
                  unsigned char* buff,
                  int width,
                  int height,
                  int depth ) {
    FILE* fp_out;

    fp_out = fopen( filename, FMODE_WRITE );
    if( fp_out == NULL ) {
        fprintf( stderr, "cannot open file: %s\n", filename );
        return;
    }

    /* write PGM Header */
    fprintf( fp_out, "P5\n" );

    /* write image size */
    fprintf( fp_out, "%d %d\n", width, height );

    /* write depth */
    fprintf( fp_out, "%d\n", depth );
    fwrite( buff, 1, width*height, fp_out );

    if( ferror(fp_out) ) {
        fprintf( stderr, "file output error: %s\n", filename );
        return;
    }

}

void FFTWExchange(int sizex,int sizey, fftw_complex *src, fftw_complex *dst) {
  int i,j,u,v;
  int imgxd2=sizex/2, imgyd2=sizey/2;
  for(j=0;j<sizey;j++){
    for(i=0;i<sizex;i++){
      u = (i<imgxd2)? imgxd2+i: i-imgxd2;
      v = (j<imgyd2)? imgyd2+j: j-imgyd2;
      c_re(dst[j*sizex+i]) = c_re(src[u+sizey*v]);
      c_im(dst[j*sizex+i]) = c_im(src[u+sizey*v]);
    }
  }
}

unsigned char* execute_fft( unsigned char* input_image_buf,
                            int width,
                            int height,
                            int depth ) {

  fftw_complex *in, *out;
  fftw_plan p;
  unsigned char* output_image_buf;
  int n;
  int i,j;
  double dMax, dMin, dNorm;
  double* pTmp;
  dMax = 0.0;
  dMin = 0.0;

  pTmp = (double*)malloc( width*height*sizeof(double) );
  if( pTmp == NULL ) {
    return NULL;
  }

  /* allocate memory */
  in  = fftw_malloc(sizeof(fftw_complex) * width * height);
  out = fftw_malloc(sizeof(fftw_complex) * width * height);

  /* copy data to fftw buffer */
  for( n = 0; n < width*height; n++ ) {
    in[n][0] = (float)input_image_buf[n];
    in[n][1] = 0.0;
  }

  /* create plan */
  p = fftw_plan_dft_2d(height, width, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

  /* do FFT */
  fftw_execute(p);

  /* allocate buffer */
  output_image_buf =
    (unsigned char*)malloc( width * height * sizeof(unsigned char) );

  if( output_image_buf != NULL ) {
    /* copy data to fftw buffer */
    for( n = 0; n < width*height; n++ ) {
      pTmp[n] = log(POWR(out[n][0],out[n][1]) + 1);
      if( pTmp[n] < dMin ) {
        dMin = pTmp[n];
      }
      if( dMax < pTmp[n] ) {
        dMax = pTmp[n];
      }
    }

   for( n = 0; n < width*height; n++ ) {
     double dWork;
     dWork = (pTmp[n] * 255.0 / (dMax - dMin));
     output_image_buf[n] = (unsigned char)(dWork);
    }
  }

  /* free memories */
  fftwnd_destroy_plan(p);

  free( pTmp );
  fftw_free(in);
  fftw_free(out);

  return output_image_buf;

}

unsigned char* execute_ifft( unsigned char* input_image_buf,
                            int width,
                            int height,
                            int depth ) {

  fftw_complex *in, *out;
  fftw_plan p;
  unsigned char* output_image_buf;
  int n;

  /* allocate memory and create plan */
  in  = fftw_malloc(sizeof(fftw_complex) * width * height);
  out = fftw_malloc(sizeof(fftw_complex) * width * height);

  p   = fftw_plan_dft_2d(height, width, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);

  /* allocate output buffer */


  /* copy data to fftw buffer */
  for( n = 0; n < width*height; n++ ) {
    in[n][0] = input_image_buf[n]; /* Re */
    in[n][1] = 0; /* Im */
  }

  /* do FFT */
  fftw_execute(p);

  /* allocate buffer */
  output_image_buf =
    (unsigned char*)malloc( width * height * sizeof(unsigned char) );

  if( output_image_buf != NULL ) {
    /* copy data to fftw buffer */
    for( n = 0; n < width*height; n++ ) {
      output_image_buf[n] = out[n][0];
    }
  }

  /* free memories */
  /*fftwnd_destroy_plan(p);*/
  fftw_free(in);
  fftw_free(out);

  return output_image_buf;

}



int main( int argv, char** argc ) {
    char* in_file_path;
    char* out_file_path;
    unsigned char* input_image_buf;
    unsigned char* output_image_buf;
    unsigned char* image_buf;
    int width;
    int height;
    int depth;
    clock_t clock_start, clock_end;

    if( argv < 3 ) {
        printf( "%s <input_file> <output_file>\n", argc[0] );
        return -1;
    }
    in_file_path = argc[1];
    out_file_path = argc[2];

    input_image_buf = load_image( in_file_path, &width, &height, &depth );
    if( input_image_buf == NULL ) {
        fprintf( stderr, "cannot load image. exit.\n" );
        return -1;
    }

    clock_start = clock();

    output_image_buf =
      execute_fft( input_image_buf, width, height, depth );

    clock_end = clock();

    if( output_image_buf != NULL ) {
      write_image( out_file_path, output_image_buf, width, height, depth );
    } else {
      fprintf( stderr, "FFT operation faild.\n" );
    }

    free( input_image_buf );
    free( output_image_buf );

    printf( "start: %d end: %d elapse: %d.\n", clock_start, clock_end, clock_end - clock_start );
    printf( "elapse time: %lf sec.", ((double)(clock_end - clock_start)) / (double)CLOCKS_PER_SEC );

    /*    free( image_buf ); */
    return 0;
};

リスト4 『マルチメディア処理から信号処理まで幅広く活用できる高速ライブラリ「IPP」』記事内で使用したIPPのサンプルデコードツール用設定ファイル

/* CommentLine: H.264 Sequence.*/
test1.yuv          /* name of source files */
1500               /* number of frames to encode */
1 250 1            /* 1(reserved) , N (# of frames in between I frames), IDR interval. */
0 1                /* Number of B frames between I (or P) and next P, treat B as a reference (only 0 is supported!) */
1 1 0              /* num_ref_frames (2-16), minimum length of list1 for backward prediction (only 1 is supported!), number of slices. */
77 0               /* profile_idc (77-main, 100-high); level_idc (set 0 for automatic selection) (check that num_ref_frames and frame size are in accordance with the level)  */
1280               /* horizontal_size */
720                /* vertical_size */
4                  /* frame_rate_code [0,8] (0-30 fps,1-15 fps,2-24 fps,3-25 fps,4-30 fps,5-30 fps,6-50 fps,7-60 fps,8-60 fps)*/
1 8 8              /* High profile: chroma_format_idc (0 - monochrom, 1 - 420, 2 - 422), bit_depth_luma [8,12], bit_depth_chroma [8,12] */
0 8 0 0 0          /* High profile: aux_format_idc: [0,3], bit_depth_aux: [8,12], alpha_incr_flag: 0, 1; alpha_opaque_value: [0, 2^(bit_depth_aux + 9) -1]; alpha_transparent_value: [0, 2^(bit_depth_aux + 9) - 1]  */
1 20 20 20 2048000 /* RC method(0 - quant_codes, 1 - CBR MBwise, 2 - CBR framewise, 3 - Debug); start qp values for I, P, B slices; bitrate (bits per second) */
2 0 8 8            /* ME method (1-6), subblock split, search x,search_y */
0 0 0              /* weighted prediction, weighted biprediction implicit weighted biprediction (not supported!)*/
0 1                /* direct type (0 - temporal 1 - spatial 2 - auto); direct_inference_flag */
0 2 2              /* disable_deblocking_idc: 1-- off, 0 - on, 2 -- on(without crossing slice boundaries); deblocking_filter_alpha, deblocking_filter_beta */
1 0 0              /* High profile: transform_8x8_mode: 0 -- off, 1 - on; 0 -- use standard, 1 -- use default scaling matrices for 8x8 quantization; qpprime_y_zero_transform_bypass_flag: (0, 1) */
1280               /* display_horizontal_size */
720                /* display_vertical_size */
1 1                /* entropy coding mode (0-cavlc,1-cabac); cabac_init_idc (0,1,2) */
0                  /* picture coding type (0 - only FRM, 1 - only FLD , 2 - only AFRM, 3  - pure PicAFF(no MBAFF) 4 PicAFF + MBAFF). Only 0 (FRM) is supported! */
0 0                /* speed/quality grade [0,3] (0-maximum speed, 3-maximum quality); OptimalQuantization (0, 1) */