diff --git a/README.md b/README.md index 59ab57d..6385349 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ The APV codec standard has the following features: ## How to use ### Encoder -Encoder as input require raw YUV file (422, 444), 10-bit or more. +Encoder as input require raw YCbCr file (422, 444), 10-bit or more. Displaying help: diff --git a/app/oapv_app_dec.c b/app/oapv_app_dec.c index 819e258..15dcbbc 100644 --- a/app/oapv_app_dec.c +++ b/app/oapv_app_dec.c @@ -419,34 +419,22 @@ int main(int argc, const char **argv) if(fp_bs == NULL) { logerr("ERROR: cannot open bitstream file = %s\n", args_var->fname_inp); print_usage(argv); - return -1; + ret = -1; goto ERR; } /* open output file */ if(strlen(args_var->fname_out) > 0) { - char fext[16]; - char *fname = (char *)args_var->fname_out; - - if(strlen(fname) < 5) { /* at least x.yuv or x.y4m */ - logerr("ERROR: invalide output file name\n"); - return -1; + ret = check_file_name_type(args_var->fname_out); + if(ret > 0) { + is_y4m = 1; } - strncpy(fext, fname + strlen(fname) - 3, sizeof(fext) - 1); - fext[0] = toupper(fext[0]); - fext[1] = toupper(fext[1]); - fext[2] = toupper(fext[2]); - - if(strcmp(fext, "YUV") == 0) { + else if(ret == 0) { is_y4m = 0; } - else if(strcmp(fext, "Y4M") == 0) { - is_y4m = 1; - } - else { - logerr("ERROR: unknown output format\n"); - ret = -1; - goto ERR; + else { // invalid or unknown file name type + logerr("unknown file type name for decoded video\n"); + ret = -1; goto ERR; } - clear_data(fname); /* remove decoded file contents if exists */ + clear_data(args_var->fname_out); /* remove decoded file contents if exists */ } // create bitstream buffer @@ -609,10 +597,14 @@ int main(int argc, const char **argv) if(write_y4m_header(args_var->fname_out, imgb_o)) { logerr("cannot write Y4M header\n"); ret = -1; - goto END; + goto ERR; } } - write_dec_img(args_var->fname_out, imgb_o, is_y4m); + if(write_dec_img(args_var->fname_out, imgb_o, is_y4m)) { + logerr("cannot write decoded 
video\n"); + ret = -1; + goto ERR; + } } frm_cnt[i]++; } diff --git a/app/oapv_app_enc.c b/app/oapv_app_enc.c index 6fd40c4..3566ca4 100644 --- a/app/oapv_app_enc.c +++ b/app/oapv_app_enc.c @@ -100,12 +100,11 @@ static const args_opt_t enc_args_opts[] = { { ARGS_NO_KEY, "input-csp", ARGS_VAL_TYPE_INTEGER, 0, NULL, "input color space (chroma format)\n" - " - 0: YUV400\n" - " - 1: YUV420\n" - " - 2: YUV422\n" - " - 3: YUV444\n" - " - 4: YUV4444\n" - " - 5: P2(Planar Y, Combined UV, 422)" + " - 0: 400\n" + " - 2: 422\n" + " - 3: 444\n" + " - 4: 4444\n" + " - 5: P2(Planar Y, Combined CbCr, 422)" }, { ARGS_NO_KEY, "profile", ARGS_VAL_TYPE_STRING, 0, NULL, @@ -154,20 +153,20 @@ static const args_opt_t enc_args_opts[] = { "user filler flag" }, { - ARGS_NO_KEY, "q-matrix-y", ARGS_VAL_TYPE_STRING, 0, NULL, - "custom quantization matrix for Y \"q1 q2 ... q63 q64\"" + ARGS_NO_KEY, "q-matrix-c0", ARGS_VAL_TYPE_STRING, 0, NULL, + "custom quantization matrix for component 0 (Y) \"q1 q2 ... q63 q64\"" }, { - ARGS_NO_KEY, "q-matrix-u", ARGS_VAL_TYPE_STRING, 0, NULL, - "custom quantization matrix for U \"q1 q2 ... q63 q64\"" + ARGS_NO_KEY, "q-matrix-c1", ARGS_VAL_TYPE_STRING, 0, NULL, + "custom quantization matrix for component 1 (Cb) \"q1 q2 ... q63 q64\"" }, { - ARGS_NO_KEY, "q-matrix-v", ARGS_VAL_TYPE_STRING, 0, NULL, - "custom quantization matrix for V \"q1 q2 ... q63 q64\"" + ARGS_NO_KEY, "q-matrix-c2", ARGS_VAL_TYPE_STRING, 0, NULL, + "custom quantization matrix for component 2 (Cr) \"q1 q2 ... q63 q64\"" }, { - ARGS_NO_KEY, "q-matrix-x", ARGS_VAL_TYPE_STRING, 0, NULL, - "custom quantization matrix for X \"q1 q2 ... q63 q64\"" + ARGS_NO_KEY, "q-matrix-c3", ARGS_VAL_TYPE_STRING, 0, NULL, + "custom quantization matrix for component 3 \"q1 q2 ... 
q63 q64\"" }, { ARGS_NO_KEY, "hash", ARGS_VAL_TYPE_NONE, 0, NULL, @@ -196,10 +195,7 @@ typedef struct args_var { int band; char bitrate[64]; char fps[256]; - char q_matrix_y[512]; - char q_matrix_u[512]; - char q_matrix_v[512]; - char q_matrix_x[512]; + char q_matrix[OAPV_MAX_CC][512]; // raster-scan order char preset[32]; oapve_param_t *param; } args_var_t; @@ -230,26 +226,26 @@ static args_var_t *args_init_vars(args_parser_t *args, oapve_param_t *param) vars->input_csp = -1; args_set_variable_by_key_long(opts, "seek", &vars->seek); args_set_variable_by_key_long(opts, "profile", vars->profile); - strncpy(vars->profile, "422-10", sizeof(vars->profile) - 1); + strcpy(vars->profile, "422-10"); args_set_variable_by_key_long(opts, "level", vars->level); - strncpy(vars->level, "4.1", sizeof(vars->level) - 1); + strcpy(vars->level, "4.1"); args_set_variable_by_key_long(opts, "band", &vars->band); vars->band = 2; /* default */ args_set_variable_by_key_long(opts, "bitrate", vars->bitrate); args_set_variable_by_key_long(opts, "fps", vars->fps); - strncpy(vars->fps, "60", sizeof(vars->fps) - 1); - args_set_variable_by_key_long(opts, "q-matrix-y", vars->q_matrix_y); - strncpy(vars->q_matrix_y, "", sizeof(vars->q_matrix_y) - 1); - args_set_variable_by_key_long(opts, "q-matrix-u", vars->q_matrix_u); - strncpy(vars->q_matrix_u, "", sizeof(vars->q_matrix_y) - 1); - args_set_variable_by_key_long(opts, "q-matrix-v", vars->q_matrix_v); - strncpy(vars->q_matrix_v, "", sizeof(vars->q_matrix_y) - 1); - args_set_variable_by_key_long(opts, "q-matrix-x", vars->q_matrix_x); - strncpy(vars->q_matrix_x, "", sizeof(vars->q_matrix_x) - 1); + strcpy(vars->fps, "60"); + args_set_variable_by_key_long(opts, "q-matrix-c0", vars->q_matrix[0]); + strcpy(vars->q_matrix[0], ""); + args_set_variable_by_key_long(opts, "q-matrix-c1", vars->q_matrix[1]); + strcpy(vars->q_matrix[1], ""); + args_set_variable_by_key_long(opts, "q-matrix-c2", vars->q_matrix[2]); + strcpy(vars->q_matrix[2], ""); + 
args_set_variable_by_key_long(opts, "q-matrix-c3", vars->q_matrix[3]); + strcpy(vars->q_matrix[3], ""); args_set_variable_by_key_long(opts, "threads", &vars->threads); vars->threads = 1; /* default */ args_set_variable_by_key_long(opts, "preset", vars->preset); - strncpy(vars->preset, "", sizeof(vars->preset) - 1); + strcpy(vars->preset, ""); ARGS_SET_PARAM_VAR_KEY(opts, param, w); ARGS_SET_PARAM_VAR_KEY(opts, param, h); @@ -325,6 +321,17 @@ static int set_extra_config(oapve_t id, args_var_t *vars, oapve_param_t *param) return ret; } +static int write_rec_img(char *fname, oapv_imgb_t *img, int flag_y4m) +{ + if(flag_y4m) { + if(write_y4m_frame_header(fname)) + return -1; + } + if(imgb_write(fname, img)) + return -1; + return 0; +} + static void print_commandline(int argc, const char **argv) { int i; @@ -449,6 +456,7 @@ static int kbps_str_to_int(char *str) static int update_param(args_var_t *vars, oapve_param_t *param) { + int q_len[OAPV_MAX_CC]; /* update reate controller parameters */ if(strlen(vars->bitrate) > 0) { param->bitrate = kbps_str_to_int(vars->bitrate); @@ -456,117 +464,27 @@ static int update_param(args_var_t *vars, oapve_param_t *param) } /* update q_matrix */ - int len_y = (int)strlen(vars->q_matrix_y); - if(len_y > 0) { - param->use_q_matrix = 1; - char *tmp = vars->q_matrix_y; - int cnt = 0; - int len_cnt = 0; - while(len_cnt < len_y && cnt < OAPV_BLK_D) { - sscanf(tmp, "%d", &param->q_matrix_y[cnt]); - if(param->q_matrix_y[cnt] < 1 || param->q_matrix_y[cnt] > 255) { - logerr("input value of q_matrix_y is invalid\n"); - return -1; - } - len_cnt += (int)log10(param->q_matrix_y[cnt]) + 2; - tmp = vars->q_matrix_y + len_cnt; - cnt++; - } - if(cnt < OAPV_BLK_D) { - logerr("input number of q_matrix_y is not enough\n"); - return -1; - } - } - - int len_u = (int)strlen(vars->q_matrix_u); - if(len_u > 0) { - param->use_q_matrix = 1; - char *tmp = vars->q_matrix_u; - int cnt = 0; - int len_cnt = 0; - while(len_cnt < len_u && cnt < OAPV_BLK_D) { - sscanf(tmp, 
"%d", &param->q_matrix_u[cnt]); - if(param->q_matrix_u[cnt] < 1 || param->q_matrix_u[cnt] > 255) { - logerr("input value of q_matrix_u is invalid\n"); - return -1; - } - len_cnt += (int)log10(param->q_matrix_u[cnt]) + 2; - tmp = vars->q_matrix_u + len_cnt; - cnt++; - } - if(cnt < OAPV_BLK_D) { - logerr("input number of q_matrix_u is not enough\n"); - return -1; - } - } - - int len_v = (int)strlen(vars->q_matrix_v); - if(len_v > 0) { - param->use_q_matrix = 1; - char *tmp = vars->q_matrix_v; - int cnt = 0; - int len_cnt = 0; - while(len_cnt < len_v && cnt < OAPV_BLK_D) { - sscanf(tmp, "%d", &param->q_matrix_v[cnt]); - if(param->q_matrix_v[cnt] < 1 || param->q_matrix_v[cnt] > 255) { - logerr("input value of q_matrix_v is invalid\n"); - return -1; + for(int c = 0; c < OAPV_MAX_CC; c++) { + q_len[c] = (int)strlen(vars->q_matrix[c]); + if(q_len[c] > 0) { + param->use_q_matrix = 1; + char *qstr = vars->q_matrix[c]; + int qcnt = 0; + while(strlen(qstr) > 0 && qcnt < OAPV_BLK_D) { + int t0, read; + sscanf(qstr, "%d%n", &t0, &read); + if(t0 < 1 || t0 > 255) { + logerr("input value (%d) for q_matrix[%d][%d] is invalid\n", t0, c, qcnt); + return -1; + } + param->q_matrix[c][qcnt] = t0; + qstr += read; + qcnt++; } - len_cnt += (int)log10(param->q_matrix_v[cnt]) + 2; - tmp = vars->q_matrix_v + len_cnt; - cnt++; - } - if(cnt < OAPV_BLK_D) { - logerr("input number of q_matrix_v is not enough\n"); - return -1; - } - } - - int len_x = (int)strlen(vars->q_matrix_x); - if (len_x > 0) { - param->use_q_matrix = 1; - char* tmp = vars->q_matrix_x; - int cnt = 0; - int len_cnt = 0; - while (len_cnt < len_x && cnt < OAPV_BLK_D) { - sscanf(tmp, "%d", &param->q_matrix_x[cnt]); - if (param->q_matrix_x[cnt] < 1 || param->q_matrix_x[cnt] > 255) { - logerr("input value of q_matrix_x is invalid\n"); + if(qcnt < OAPV_BLK_D) { + logerr("input number of q_matrix[%d] is not enough\n", c); return -1; } - len_cnt += (int)log10(param->q_matrix_x[cnt]) + 2; - tmp = vars->q_matrix_x + len_cnt; - cnt++; - } - if (cnt 
< OAPV_BLK_D) { - logerr("input number of q_matrix_x is not enough\n"); - return -1; - } - } - - if(param->use_q_matrix) { - if(len_y == 0) { - for(int i = 0; i < OAPV_BLK_D; i++) { - param->q_matrix_y[i] = 16; - } - } - - if(len_u == 0) { - for(int i = 0; i < OAPV_BLK_D; i++) { - param->q_matrix_u[i] = 16; - } - } - - if(len_v == 0) { - for(int i = 0; i < OAPV_BLK_D; i++) { - param->q_matrix_v[i] = 16; - } - } - - if (len_x == 0) { - for (int i = 0; i < OAPV_BLK_D; i++) { - param->q_matrix_x[i] = 16; - } } } @@ -644,9 +562,10 @@ int main(int argc, const char **argv) int ret; oapv_clk_t clk_beg, clk_end, clk_tot; oapv_mtime_t au_cnt, au_skip; + int frm_cnt[MAX_NUM_FRMS] = { 0 }; double bitrate_tot; // total bitrate (byte) double psnr_avg[MAX_NUM_FRMS][MAX_NUM_CC] = { 0 }; - int is_y4m; + int is_inp_y4m, is_rec_y4m; y4m_params_t y4m; int is_out = 0, is_rec = 0; char *errstr = NULL; @@ -713,8 +632,8 @@ int main(int argc, const char **argv) } /* y4m header parsing */ - is_y4m = y4m_test(fp_inp); - if(is_y4m) { + is_inp_y4m = y4m_test(fp_inp); + if(is_inp_y4m) { if(y4m_header_parser(fp_inp, &y4m)) { logerr("This y4m is not supported (%s)\n", args_var->fname_inp); ret = -1; @@ -775,6 +694,17 @@ int main(int argc, const char **argv) } if(strlen(args_var->fname_rec) > 0) { + ret = check_file_name_type(args_var->fname_rec); + if(ret > 0) { + is_rec_y4m = 1; + } + else if(ret == 0) { + is_rec_y4m = 0; + } + else { // invalid or unknown file name type + logerr("unknown file name type for reconstructed video\n"); + ret = -1; goto ERR; + } clear_data(args_var->fname_rec); is_rec = 1; } @@ -858,7 +788,7 @@ int main(int argc, const char **argv) else { imgb_i = imgb_r; } - ret = imgb_read(fp_inp, imgb_i, param->w, param->h, is_y4m); + ret = imgb_read(fp_inp, imgb_i, param->w, param->h, is_inp_y4m); if(ret < 0) { logv3("reached out the end of input file\n"); ret = OAPV_OK; @@ -885,14 +815,14 @@ int main(int argc, const char **argv) print_stat_au(&stat, au_cnt, param, 
args_var->max_au, bitrate_tot, clk_end, clk_tot); - for(int i = 0; i < num_frames; i++) { + for(int fidx = 0; fidx < num_frames; fidx++) { if(is_rec) { if(args_var->input_depth != 10) { - imgb_cpy(imgb_w, rfrms.frm[i].imgb); + imgb_cpy(imgb_w, rfrms.frm[fidx].imgb); imgb_o = imgb_w; } else { - imgb_o = rfrms.frm[i].imgb; + imgb_o = rfrms.frm[fidx].imgb; } } @@ -914,16 +844,23 @@ int main(int argc, const char **argv) // store recon image if(is_rec) { - if(imgb_write(args_var->fname_rec, imgb_o)) { - logerr("cannot write reconstruction image\n"); + if(frm_cnt[fidx] == 0 && is_rec_y4m) { + if(write_y4m_header(args_var->fname_rec, imgb_o)) { + logerr("cannot write Y4M header\n"); + ret = -1; + goto ERR; + } + } + if(write_rec_img(args_var->fname_rec, imgb_o, is_rec_y4m)) { + logerr("cannot write reconstructed video\n"); ret = -1; goto ERR; } } - print_stat_frms(&stat, &ifrms, &rfrms, psnr_avg); - au_cnt++; + frm_cnt[fidx] += 1; } + au_cnt++; } else if(state == STATE_SKIPPING) { if(au_skip < args_var->seek) { diff --git a/app/oapv_app_y4m.h b/app/oapv_app_y4m.h index 659eb7b..e385028 100644 --- a/app/oapv_app_y4m.h +++ b/app/oapv_app_y4m.h @@ -290,4 +290,32 @@ static int write_y4m_frame_header(char *fname) return 0; } +// check whether file name is y4m type or not +// return +// - positive value : file name has y4m format name +// - zero : YUV format name +// - negative value : unknown format name +static int check_file_name_type(char * fname) +{ + char fext[16]; + if(strlen(fname) < 5) { /* at least x.yuv or x.y4m */ + return -1; + } + strncpy(fext, fname + strlen(fname) - 3, sizeof(fext) - 1); + fext[0] = toupper(fext[0]); + fext[1] = toupper(fext[1]); + fext[2] = toupper(fext[2]); + + if(strcmp(fext, "YUV") == 0) { + return 0; + } + else if(strcmp(fext, "Y4M") == 0) { + return 1; + } + else { + return -1; + } + return -1; // false +} + #endif /* _OAPV_APP_Y4M_H_ */ \ No newline at end of file diff --git a/inc/oapv.h b/inc/oapv.h index fbf49e1..ba9e6b4 100644 --- 
a/inc/oapv.h +++ b/inc/oapv.h @@ -129,7 +129,7 @@ extern "C" { #define OAPV_CS_YCBCR420_14LE OAPV_CS_SET(OAPV_CF_YCBCR420, 14, 0) #define OAPV_CS_P210 OAPV_CS_SET(OAPV_CF_PLANAR2, 10, 0) -/* max number of color channel: YCbCr4444 -> 4 channels */ +/* max number of color channel: ex) YCbCr4444 -> 4 channels */ #define OAPV_MAX_CC (4) /***************************************************************************** @@ -342,17 +342,26 @@ struct oapv_bitb { *****************************************************************************/ typedef struct oapv_frm_info oapv_frm_info_t; struct oapv_frm_info { - int w; - int h; - int cs; - int pbu_type; - int group_id; - int profile_idc; - int level_idc; - int band_idc; - int chroma_format_idc; - int bit_depth; - int capture_time_distance; + int w; + int h; + int cs; + int pbu_type; + int group_id; + int profile_idc; + int level_idc; + int band_idc; + int chroma_format_idc; + int bit_depth; + int capture_time_distance; + /* custom quantization matrix */ + int use_q_matrix; + unsigned char q_matrix[OAPV_MAX_CC][OAPV_BLK_D]; // only meaningful if use_q_matrix is true + /* color description values */ + int color_description_present_flag; + unsigned char color_primaries; // only meaningful if color_description_present_flag is true + unsigned char transfer_characteristics; // only meaningful if color_description_present_flag is true + unsigned char matrix_coefficients; // only meaningful if color_description_present_flag is true + int full_range_flag; // only meaningful if color_description_present_flag is true }; typedef struct oapv_au_info oapv_au_info_t; @@ -367,43 +376,46 @@ struct oapv_au_info { typedef struct oapve_param oapve_param_t; struct oapve_param { /* profile_idc */ - int profile_idc; + int profile_idc; /* level */ - int level_idc; + int level_idc; /* band */ - int band_idc; + int band_idc; /* width of input frame */ - int w; + int w; /* height of input frame */ - int h; + int h; /* frame rate (Hz) numerator, denominator 
*/ - int fps_num; - int fps_den; + int fps_num; + int fps_den; /* rate control type */ - int rc_type; + int rc_type; /* quantization parameter (0 ~ 63)*/ - int qp; + int qp; /* quantization parameter offset for CB */ - int qp_cb_offset; + int qp_cb_offset; /* quantization parameter offset for CR */ - int qp_cr_offset; + int qp_cr_offset; /* bitrate (unit: kbps) */ - int bitrate; + int bitrate; /* use filler data for tight constant bitrate */ - int use_filler; - /* use filler quantization matrix */ - int use_q_matrix; - int q_matrix_y[OAPV_BLK_D]; - int q_matrix_u[OAPV_BLK_D]; - int q_matrix_v[OAPV_BLK_D]; - int q_matrix_x[OAPV_BLK_D]; + int use_filler; + /* use quantization matrix */ + int use_q_matrix; + unsigned char q_matrix[OAPV_MAX_CC][OAPV_BLK_D]; // raster-scan order /* color space */ - int csp; - int tile_cols; - int tile_rows; - int tile_w_mb; - int tile_h_mb; - int preset; + int csp; + int tile_cols; + int tile_rows; + int tile_w_mb; + int tile_h_mb; + int preset; + /* color description values */ + int color_description_present_flag; + unsigned char color_primaries; + unsigned char transfer_characteristics; + unsigned char matrix_coefficients; + int full_range_flag; }; /***************************************************************************** diff --git a/readme/apv_isobmff.md b/readme/apv_isobmff.md index 5515ab1..2aa19cd 100644 --- a/readme/apv_isobmff.md +++ b/readme/apv_isobmff.md @@ -32,7 +32,7 @@ class APV1SmapleEntry extends VisualSampleEntry('apv1'){ ### Semantics -The value of largest_frame_width_minus1 + 1 and largest_frame_height_minus1 + 1 of the APVCodecConfigurationBox shall be used for the value of width and height fields of the VisualSampleEntry, respectively. +The value of largest_frame_width and largest_frame_height of the APVCodecConfigurationBox shall be used for the value of width and height fields of the VisualSampleEntry, respectively. 
When the sample entry name is 'apv1', the stream to which this sample entry applies shall be a compliant APV stream as viewed by an APV decoder operating under the configuration (including profile, level, and so on) given in the APVCodecConfigurationBox. @@ -73,8 +73,8 @@ aligned(8) class APVDecoderConfigurationBox extends FullBox('apvC',version=0, fl unsigned int(8) profile_idc[i][j]; unsigned int(8) level_idc[i][j]; unsigned int(8) band_idc[i][j]; - unsigned int(32) frame_width_minus1[i][j]; - unsigned int(32) frame_height_minus1[i][j]; + unsigned int(32) frame_width[i][j]; + unsigned int(32) frame_height[i][j]; unsigned int(4) chroma_format_idc[i][j]; unsigned int(4) bit_depth_minus8[i][j]; unsigned int(8) capture_time_distance[i][j]; @@ -82,6 +82,7 @@ aligned(8) class APVDecoderConfigurationBox extends FullBox('apvC',version=0, fl unsigned int(8) color_primaries[i][j]; unsigned int(8) transfer_characteristics[i][j]; unsigned int(8) matrix_coefficients[i][j]; + unsigned int(1) full_range_flag[i][j]; } } } @@ -91,54 +92,56 @@ aligned(8) class APVDecoderConfigurationBox extends FullBox('apvC',version=0, fl ### Semantics + number_of_configuration_entry - > indicates the number of frame header information for a specific PBU types are stored. +> indicates the number of frame header information for a specific PBU types are stored. + pbu_type[i] - - > indicates the value of the pbu_type field in the pbu header immediately preceding the frame data for a certain index i. +> indicates the value of the pbu_type field in the pbu header immediately preceding the frame data for a certain index i. + number_of_frame_info[i] - - > indicates the number of variations of the frame header information for the frames whose value of the pbu_type field in the pbu header immediately preceding it is idendtical with the value of the pub_type[i] field for a certain index i. 
+> indicates the number of variations of the frame header information for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + color_description_present_flag[i][j] - >indicates whether the color description information is provided for the jth variation of frame header whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +>indicates whether the color description information is provided for the jth variation of frame header whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + capture_time_distance_ignored[i][j] - > indicates whether the value of the capture_time_distance field in the jth variation of frame header is used for the processing of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates whether the value of the capture_time_distance field in the jth variation of frame header is used for the processing of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + profile_idc[i][j] - > indicates the value of the profile_idc field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i.
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the profile_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of profile_idc field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the profile_idc field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the profile_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of profile_idc field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + level_idc[i][j] - > indicates the value of the level_idc field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i.
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the level_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of level_idc field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the level_idc field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the level_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of level_idc field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + band_idc[i][j] - > indicates the value of the band_idc field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i.
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the band_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of band_idc field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the band_idc field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the band_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of band_idc field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. -+ frame_width_minus1[i][j] - > indicates the value of the frame_width_minus1 field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i.
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the frame_width_minus1 field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of frame_width_minus1 field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. ++ frame_width[i][j] +> indicates the value of the frame_width field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the frame_width field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of frame_width field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. -+ frame_height_minus1[i][j] - > indicates the value of the frame_height_minus1 field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i.
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the frame_height_minus1 field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of frame_height_minus1 field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. ++ frame_height[i][j] +> indicates the value of the frame_height field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the frame_height field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of frame_height field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + chroma_format_idc[i][j] - > indicates the value of the chroma_format_idc field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i.
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the chroma_format_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of chroma_format_idc field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the chroma_format_idc field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the chroma_format_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of chroma_format_idc field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + bit_depth_minus8[i] - > indicates the value of the bit_depth_minus8 field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i.
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the bit_depth_minus8 field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of bit_depth_minus8 field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the bit_depth_minus8 field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the bit_depth_minus8 field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of bit_depth_minus8 field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + capture_time_distance[i][j] - > indicates the value of the capture_time_distance field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. 
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the capture_time_distance field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of capture_time_distance field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the capture_time_distance field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the capture_time_distance field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of capture_time_distance field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + color_primaries[i][j] - > indicates the value of the color_primaries field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. 
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the profile_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of color_primaries field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the color_primaries field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the color_primaries field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of color_primaries field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + transfer_characteristics[i][j] - > indicates the value of the transfer_characteristics field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. 
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the profile_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of transfer_characteristics field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the transfer_characteristics field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the transfer_characteristics field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of transfer_characteristics field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + matrix_coefficients[i][j] - > indicates the value of the matrix_coefficients field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. 
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the profile_idc field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1 is greater than 1, then the frame header in each sample must provide the value of matrix_cofficients field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. +> indicates the value of the matrix_coefficients field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of the matrix_coefficients field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of matrix_coefficients field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. + ++ full_range_flag[i][j] +> indicates the value of the full_range_flag field in the jth variation of the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. 
If the value of number_of_frame_info[i] is 1, then the same value of this field must be used as the value of +the full_range_flag field in the frame header of the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. If the value of number_of_frame_info[i] is greater than 1, then the frame header in each sample must provide the value of full_range_flag field matched with one among the values of this field for all index j for the frames whose value of the pbu_type field in the pbu header immediately preceding it is identical with the value of the pbu_type[i] field for a certain index i. ## APV Sample Description diff --git a/src/avx/oapv_sad_avx.c b/src/avx/oapv_sad_avx.c index 67d773f..e114c3d 100644 --- a/src/avx/oapv_sad_avx.c +++ b/src/avx/oapv_sad_avx.c @@ -33,30 +33,188 @@ #if X86_SSE -static s64 ssd_16b_sse_8x8_avx(int w, int h, void* src1, void* src2, int s_src1, int s_src2, int bit_depth) +/* SAD ***********************************************************************/ +static int sad_16b_avx_8x8(int w, int h, void* src1, void* src2, int s_src1, int s_src2) { s16* s1 = (s16*)src1; s16* s2 = (s16*)src2; - int t[8] = { 0 }; - __m256i sum = _mm256_setzero_si256(); - __m256i v1, v2; - - for (int i = 0; i < 64; i += 8) - { - v1 = _mm256_loadu_si256((const __m256i*)(s1 + i)); - v2 = _mm256_loadu_si256((const __m256i*)(s2 + i)); - v2 = _mm256_sub_epi16(v1, v2); - v2 = _mm256_madd_epi16(v2, v2); - sum = _mm256_add_epi32(sum, v2); - _mm256_storeu_si256((__m256i*)(t), sum); - } - return t[0] + t[1] + t[2] + t[3]; + __m256i zero_vector = _mm256_setzero_si256(); + __m256i s1_vector, s2_vector, diff_vector, diff_abs1, diff_abs2; + // Because we are working with 16 elements at a time, stride is multiplied by 2. + s16 s1_stride = 2 * s_src1; + s16 s2_stride = 2 * s_src2; + { // Row 0 and Row 1 + // Load Row 0 and Row 1 data into registers. 
+ s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + // Calculate absolute difference between two rows. + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + diff_abs1 = _mm256_abs_epi16(diff_vector); + } + { // Row 2 and Row 3 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + diff_abs2 = _mm256_abs_epi16(diff_vector); + } + // Add absolute differences to running total. + __m256i sum = _mm256_add_epi16(diff_abs1, diff_abs2); + { // Row 4 and Row 5 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + diff_abs2 = _mm256_abs_epi16(diff_vector); + sum = _mm256_add_epi16(sum, diff_abs2); + } + { // Row 6 and Row 7 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + diff_abs2 = _mm256_abs_epi16(diff_vector); + sum = _mm256_add_epi16(sum, diff_abs2); + } + // Convert 16-bit integers to 32-bit integers for summation. + __m128i sum_low = _mm256_extracti128_si256(sum, 0); + __m128i sum_high = _mm256_extracti128_si256(sum, 1); + __m256i sum_low_32 = _mm256_cvtepi16_epi32(sum_low); + __m256i sum_high_32 = _mm256_cvtepi16_epi32(sum_high); + // Sum up all the values in the array to get final SAD value. 
+ sum = _mm256_add_epi32(sum_low_32, sum_high_32); + __m256i sum_hadd = _mm256_hadd_epi32(sum, zero_vector); // Horizontal add with zeros + sum = _mm256_hadd_epi32(sum_hadd, zero_vector); // Horizontal add with zeros + int sum1 = _mm256_extract_epi32(sum, 0); + int sum2 = _mm256_extract_epi32(sum, 4); + int sad = sum1 + sum2; + return sad; +} + +const oapv_fn_sad_t oapv_tbl_fn_sad_16b_avx[2] = +{ + sad_16b_avx_8x8, + NULL +}; + +/* SSD ***********************************************************************/ +static s64 ssd_16b_avx_8x8(int w, int h, void* src1, void* src2, int s_src1, int s_src2) +{ + s16* s1 = (s16*)src1; + s16* s2 = (s16*)src2; + __m256i s1_vector, s2_vector, diff_vector, sq_vector1, sq_vector2; + s64 sum_arr[4]; + // Because we are working with 16 elements at a time, stride is multiplied by 2. + s16 s1_stride = 2 * s_src1; + s16 s2_stride = 2 * s_src2; + s64 ssd = 0; + { // Row 0 and Row 1 + // Load Row 0 and Row 1 data into registers. + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + // Calculate squared difference between two rows. + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + sq_vector1 = _mm256_madd_epi16(diff_vector, diff_vector); + } + { // Row 2 and Row 3 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + sq_vector2 = _mm256_madd_epi16(diff_vector, diff_vector); + } + // Add squared differences to running total. 
+ __m256i sum = _mm256_add_epi32(sq_vector1, sq_vector2); + { // Row 4 and Row 5 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + sq_vector2 = _mm256_madd_epi16(diff_vector, diff_vector); + sum = _mm256_add_epi32(sum, sq_vector2); + } + { // Row 6 and Row 7 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + sq_vector2 = _mm256_madd_epi16(diff_vector, diff_vector); + sum = _mm256_add_epi32(sum, sq_vector2); + } + // Convert 16-bit integers to 32-bit integers for summation. + __m128i sum_low = _mm256_extracti128_si256(sum, 0); + __m128i sum_high = _mm256_extracti128_si256(sum, 1); + __m256i sum_low_64 = _mm256_cvtepi32_epi64(sum_low); + __m256i sum_high_64 = _mm256_cvtepi32_epi64(sum_high); + // Sum up all the values in the array to get final SSD value. + sum = _mm256_add_epi64(sum_low_64, sum_high_64); + _mm256_storeu_si256((__m256i*)sum_arr, sum); // store in array for summation. + ssd = sum_arr[0] + sum_arr[1] + sum_arr[2] + sum_arr[3]; + return ssd; } const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_avx[2] = { - ssd_16b_sse_8x8_avx, - NULL + ssd_16b_avx_8x8, + NULL }; +/* DIFF ***********************************************************************/ +static void diff_16b_avx_8x8(int w, int h, void* src1, void* src2, int s_src1, int s_src2, int s_diff, s16 *diff) +{ + s16* s1 = (s16*)src1; + s16* s2 = (s16*)src2; + __m256i s1_vector, s2_vector, diff_vector; + // Because we are working with 16 elements at a time, stride is multiplied by 2. + s16 s1_stride = 2 * s_src1; + s16 s2_stride = 2 * s_src2; + s16 diff_stride = 2 * s_diff; + { // Row 0 and Row 1 + // Load Row 0 and Row 1 data into registers. 
+ s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + // Calculate difference between two rows and store it in diff buffer. + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + _mm256_storeu_si256((__m256i*)diff, diff_vector); + diff += diff_stride; + } + { // Row 2 and Row 3 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + _mm256_storeu_si256((__m256i*)diff, diff_vector); + diff += diff_stride; + } + { // Row 4 and Row 5 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s1 += s1_stride; + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + s2 += s2_stride; + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + _mm256_storeu_si256((__m256i*)diff, diff_vector); + diff += diff_stride; + } + { // Row 6 and Row 7 + s1_vector = _mm256_loadu_si256((const __m256i*)(s1)); + s2_vector = _mm256_loadu_si256((const __m256i*)(s2)); + diff_vector = _mm256_sub_epi16(s1_vector, s2_vector); + _mm256_storeu_si256((__m256i*)diff, diff_vector); + } +} + +const oapv_fn_diff_t oapv_tbl_fn_diff_16b_avx[2] = +{ + diff_16b_avx_8x8, + NULL +}; #endif \ No newline at end of file diff --git a/src/avx/oapv_sad_avx.h b/src/avx/oapv_sad_avx.h index 8cc31cd..3165316 100644 --- a/src/avx/oapv_sad_avx.h +++ b/src/avx/oapv_sad_avx.h @@ -36,7 +36,9 @@ #include #if X86_SSE +extern const oapv_fn_sad_t oapv_tbl_fn_sad_16b_avx[2]; extern const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_avx[2]; +extern const oapv_fn_diff_t oapv_tbl_fn_diff_16b_avx[2]; #endif /* X86_SSE */ #endif /* _OAPV_SAD_AVX_H_ */ diff --git a/src/neon/oapv_sad_neon.c b/src/neon/oapv_sad_neon.c index edc2df0..f494ae7 100644 --- a/src/neon/oapv_sad_neon.c +++ b/src/neon/oapv_sad_neon.c @@ -34,8 +34,150 @@ #if ARM_NEON +/* SAD for 16bit 
**************************************************************/ +int sad_16b_neon_8x2n(int w, int h, void *src1, void *src2, int s_src1, int s_src2) +{ + int sad = 0; + s16* s1 = (s16*) src1; + s16* s2 = (s16*) src2; + int16x8_t s1_vector, s2_vector; + int32x4_t diff_part1, diff_part2, diff_part1_abs, diff_part2_abs, sad_vector, sad_vector_temp; + // Loop unrolled + { // Row 0 + // Loading one row (8 elements) each of src1 and src_2 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + // Subtracting s1_vector from s2_vector and storing in 32 bits + diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); + diff_part2 = vsubl_high_s16(s1_vector, s2_vector); + + //Taking absolute value of difference and adding them + diff_part1_abs = vabsq_s32(diff_part1); + diff_part2_abs = vabsq_s32(diff_part2); + + sad_vector = vaddq_s32(diff_part1_abs, diff_part2_abs); + } + { // Row 1 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); + diff_part2 = vsubl_high_s16(s1_vector, s2_vector); + + diff_part1_abs = vabsq_s32(diff_part1); + diff_part2_abs = vabsq_s32(diff_part2); + + sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); + // Updating sad_vector by adding the new values + sad_vector = vaddq_s32(sad_vector, sad_vector_temp); + } + { // Row 2 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); + diff_part2 = vsubl_high_s16(s1_vector, s2_vector); + + diff_part1_abs = vabsq_s32(diff_part1); + diff_part2_abs = vabsq_s32(diff_part2); + + sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); + sad_vector = vaddq_s32(sad_vector, sad_vector_temp); + } + { // Row 3 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_part1 = 
vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); + diff_part2 = vsubl_high_s16(s1_vector, s2_vector); + + diff_part1_abs = vabsq_s32(diff_part1); + diff_part2_abs = vabsq_s32(diff_part2); + + sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); + sad_vector = vaddq_s32(sad_vector, sad_vector_temp); + } + { // Row 4 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); + diff_part2 = vsubl_high_s16(s1_vector, s2_vector); + + diff_part1_abs = vabsq_s32(diff_part1); + diff_part2_abs = vabsq_s32(diff_part2); + + sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); + sad_vector = vaddq_s32(sad_vector, sad_vector_temp); + } + { // Row 5 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); + diff_part2 = vsubl_high_s16(s1_vector, s2_vector); + + diff_part1_abs = vabsq_s32(diff_part1); + diff_part2_abs = vabsq_s32(diff_part2); + + sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); + sad_vector = vaddq_s32(sad_vector, sad_vector_temp); + } + { // Row 6 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); + diff_part2 = vsubl_high_s16(s1_vector, s2_vector); + + diff_part1_abs = vabsq_s32(diff_part1); + diff_part2_abs = vabsq_s32(diff_part2); + + sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); + sad_vector = vaddq_s32(sad_vector, sad_vector_temp); + } + { // Row 7 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); + diff_part2 = vsubl_high_s16(s1_vector, s2_vector); + + diff_part1_abs = vabsq_s32(diff_part1); + diff_part2_abs = vabsq_s32(diff_part2); + + sad_vector_temp = 
vaddq_s32(diff_part1_abs, diff_part2_abs); + sad_vector = vaddq_s32(sad_vector, sad_vector_temp); + } + // Adding all the elments in sad vector + sad = vaddvq_s32(sad_vector); + return sad; +} + +const oapv_fn_sad_t oapv_tbl_fn_sad_16b_neon[2] = { + sad_16b_neon_8x2n, + NULL +}; + /* SSD ***********************************************************************/ -static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth) +static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2) { s64 ssd = 0; s16* s1 = (s16*) src1; @@ -45,8 +187,8 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in int32x4_t diff1, diff2; int32x2_t diff1_low, diff2_low; int64x2_t sq_diff1_low, sq_diff1_high, sq_diff2_low, sq_diff2_high, sq_diff; - - { + // Loop unrolling + { // Row 0 s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -66,7 +208,7 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in sq_diff = vaddq_s64(sq_diff, sq_diff2_low); sq_diff = vaddq_s64(sq_diff, sq_diff2_high); } - { + { // Row 1 s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -87,7 +229,7 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in sq_diff = vaddq_s64(sq_diff, sq_diff2_low); sq_diff = vaddq_s64(sq_diff, sq_diff2_high); } - { + { // Row 2 s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -108,7 +250,7 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in sq_diff = vaddq_s64(sq_diff, sq_diff2_low); sq_diff = vaddq_s64(sq_diff, sq_diff2_high); } - { + { // Row 3 s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -129,7 +271,7 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in sq_diff = vaddq_s64(sq_diff, sq_diff2_low); sq_diff = vaddq_s64(sq_diff, sq_diff2_high); } - { + { // Row 4 s1_vector = 
vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -150,7 +292,7 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in sq_diff = vaddq_s64(sq_diff, sq_diff2_low); sq_diff = vaddq_s64(sq_diff, sq_diff2_high); } - { + { // Row 5 s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -171,7 +313,7 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in sq_diff = vaddq_s64(sq_diff, sq_diff2_low); sq_diff = vaddq_s64(sq_diff, sq_diff2_high); } - { + { // Row 6 s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -192,7 +334,7 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in sq_diff = vaddq_s64(sq_diff, sq_diff2_low); sq_diff = vaddq_s64(sq_diff, sq_diff2_high); } - { + { // Row 7 s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -222,6 +364,109 @@ const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_neon[2] = ssd_16b_neon_8x8, NULL}; +/* DIFF **********************************************************************/ +static void diff_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff) +{ + s16* s1 = (s16*) src1; + s16* s2 = (s16*) src2; + int16x8_t s1_vector, s2_vector, diff_vector; + // Loop unrolled + { // Row 0 + // Loading one row (8 elements) each of src1 and src_2 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + // Subtracting s1_vector from s2_vector + diff_vector = vsubq_s16(s1_vector, s2_vector); + + // Storing the result in diff + vst1q_s16(diff, diff_vector); + diff += s_diff; + } + { // Row 1 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_vector = vsubq_s16(s1_vector, s2_vector); + + vst1q_s16(diff, diff_vector); + diff += s_diff; + } + { // Row 2 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_vector = vsubq_s16(s1_vector, 
s2_vector); + + vst1q_s16(diff, diff_vector); + diff += s_diff; + } + { // Row 3 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_vector = vsubq_s16(s1_vector, s2_vector); + + vst1q_s16(diff, diff_vector); + diff += s_diff; + } + { // Row 4 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_vector = vsubq_s16(s1_vector, s2_vector); + + vst1q_s16(diff, diff_vector); + diff += s_diff; + } + { // Row 5 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_vector = vsubq_s16(s1_vector, s2_vector); + + vst1q_s16(diff, diff_vector); + diff += s_diff; + } + { // Row 6 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_vector = vsubq_s16(s1_vector, s2_vector); + + vst1q_s16(diff, diff_vector); + diff += s_diff; + } + { // Row 7 + s1_vector = vld1q_s16(s1); + s1 += s_src1; + s2_vector = vld1q_s16(s2); + s2 += s_src2; + + diff_vector = vsubq_s16(s1_vector, s2_vector); + + vst1q_s16(diff, diff_vector); + diff += s_diff; + } +} +const oapv_fn_diff_t oapv_tbl_fn_diff_16b_neon[2] = { + diff_16b_neon_8x8, + NULL +}; int oapv_dc_removed_had8x8_neon(pel* org, int s_org) { diff --git a/src/neon/oapv_sad_neon.h b/src/neon/oapv_sad_neon.h index bde968e..addb9f0 100644 --- a/src/neon/oapv_sad_neon.h +++ b/src/neon/oapv_sad_neon.h @@ -36,7 +36,9 @@ #include "oapv_sad.h" #if ARM_NEON +extern const oapv_fn_sad_t oapv_tbl_fn_sad_16b_neon[2]; extern const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_neon[2]; +extern const oapv_fn_diff_t oapv_tbl_fn_diff_16b_neon[2]; int oapv_dc_removed_had8x8_neon(pel* org, int s_org); #endif /* ARM_NEON */ diff --git a/src/oapv.c b/src/oapv.c index e6c211b..024d6c4 100644 --- a/src/oapv.c +++ b/src/oapv.c @@ -217,6 +217,23 @@ static void copy_fi_to_finfo(oapv_fi_t *fi, int pbu_type, int group_id, oapv_frm finfo->capture_time_distance = fi->capture_time_distance; } +static void 
copy_fh_to_finfo(oapv_fh_t *fh, int pbu_type, int group_id, oapv_frm_info_t *finfo) +{ + copy_fi_to_finfo(&fh->fi, pbu_type, group_id, finfo); + finfo->use_q_matrix = fh->use_q_matrix; + for(int c = 0; c < OAPV_MAX_CC; c++) { + int mod = (1 << OAPV_LOG2_BLK) - 1; + for(int i = 0; i < OAPV_BLK_D; i++) { + finfo->q_matrix[c][i] = fh->q_matrix[c][i >> OAPV_LOG2_BLK][i & mod]; + } + } + finfo->color_description_present_flag = fh->color_description_present_flag; + finfo->color_primaries = fh->color_primaries; + finfo->transfer_characteristics = fh->transfer_characteristics; + finfo->matrix_coefficients = fh->matrix_coefficients; + finfo->full_range_flag = fh->full_range_flag; +} + /////////////////////////////////////////////////////////////////////////////// // start of encoder code #if ENABLE_ENCODER @@ -301,13 +318,11 @@ static double enc_block(oapve_ctx_t *ctx, oapve_core_t *core, int log2_w, int lo oapv_trans(ctx, core->coef, log2_w, log2_h, bit_depth); ctx->fn_quant[0](core->coef, core->qp[c], core->q_mat_enc[c], log2_w, log2_h, bit_depth, c ? 
128 : 212); - int prev_dc = core->prev_dc[c]; + core->dc_diff = core->coef[0] - core->prev_dc[c]; core->prev_dc[c] = core->coef[0]; - core->coef[0] = core->coef[0] - prev_dc; if(ctx->rec) { oapv_mcpy(core->coef_rec, core->coef, sizeof(s16) * OAPV_BLK_D); - core->coef_rec[0] = core->coef_rec[0] + prev_dc; ctx->fn_dquant[0](core->coef_rec, core->q_mat_dec[c], log2_w, log2_h, core->dq_shift[c]); ctx->fn_itx[0](core->coef_rec, ITX_SHIFT1, ITX_SHIFT2(bit_depth), 1 << log2_w); } @@ -340,7 +355,7 @@ static double enc_block_rdo_slow(oapve_ctx_t *ctx, oapve_core_t *core, int log2_ oapv_mcpy(recon, coeff, sizeof(s16) * OAPV_BLK_D); ctx->fn_dquant[0](recon, core->q_mat_dec[c], log2_w, log2_h, core->dq_shift[c]); ctx->fn_itx[0](recon, ITX_SHIFT1, ITX_SHIFT2(bit_depth), 1 << log2_w); - int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w, bit_depth); + int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w); oapv_mcpy(best_coeff, coeff, sizeof(s16) * OAPV_BLK_D); if(ctx->rec) { oapv_mcpy(best_recon, recon, sizeof(s16) * OAPV_BLK_D); @@ -384,7 +399,7 @@ static double enc_block_rdo_slow(oapve_ctx_t *ctx, oapve_core_t *core, int log2_ oapv_mcpy(recon, coeff, sizeof(s16) * OAPV_BLK_D); ctx->fn_dquant[0](recon, core->q_mat_dec[c], log2_w, log2_h, core->dq_shift[c]); ctx->fn_itx[0](recon, ITX_SHIFT1, ITX_SHIFT2(bit_depth), 1 << log2_w); - int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w, bit_depth); + int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w); if(cost < best_cost) { best_cost = cost; @@ -404,9 +419,8 @@ static double enc_block_rdo_slow(oapve_ctx_t *ctx, oapve_core_t *core, int log2_ } } - int curr_dc = best_coeff[0]; - best_coeff[0] -= core->prev_dc[c]; - core->prev_dc[c] = curr_dc; + core->dc_diff = best_coeff[0] - core->prev_dc[c]; + core->prev_dc[c] = best_coeff[0]; return best_cost; } @@ -446,7 +460,7 @@ static double enc_block_rdo_medium(oapve_ctx_t *ctx, oapve_core_t *core, int log 
ctx->fn_itx_part[0](recon, tmp_buf, ITX_SHIFT1, 1 << log2_w); oapv_itx_get_wo_sft(tmp_buf, recon, rec_ups, ITX_SHIFT2(bit_depth), 1 << log2_h); - int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w, bit_depth); + int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w); oapv_mcpy(best_coeff, coeff, sizeof(s16) * OAPV_BLK_D); if(ctx->rec) { oapv_mcpy(best_recon, recon, sizeof(s16) * OAPV_BLK_D); @@ -499,7 +513,7 @@ static double enc_block_rdo_medium(oapve_ctx_t *ctx, oapve_core_t *core, int log recon[k] = (rec_tmp[k] + 512) >> 10; } - int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w, bit_depth); + int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w); if(cost < best_cost) { oapv_mcpy(rec_ups, rec_tmp, sizeof(int) * OAPV_BLK_D); best_cost = cost; @@ -522,9 +536,8 @@ static double enc_block_rdo_medium(oapve_ctx_t *ctx, oapve_core_t *core, int log ctx->fn_itx[0](best_recon, ITX_SHIFT1, ITX_SHIFT2(bit_depth), 1 << log2_w); } - int curr_dc = best_coeff[0]; - best_coeff[0] -= core->prev_dc[c]; - core->prev_dc[c] = curr_dc; + core->dc_diff = best_coeff[0] - core->prev_dc[c]; + core->prev_dc[c] = best_coeff[0]; return best_cost; } @@ -555,7 +568,7 @@ static double enc_block_rdo_placebo(oapve_ctx_t *ctx, oapve_core_t *core, int lo oapv_mcpy(recon, coeff, sizeof(s16) * OAPV_BLK_D); ctx->fn_dquant[0](recon, core->q_mat_dec[c], log2_w, log2_h, core->dq_shift[c]); ctx->fn_itx[0](recon, ITX_SHIFT1, ITX_SHIFT2(bit_depth), 1 << log2_w); - int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w, bit_depth); + int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w); oapv_mcpy(best_coeff, coeff, sizeof(s16) * OAPV_BLK_D); if(ctx->rec) { oapv_mcpy(best_recon, recon, sizeof(s16) * OAPV_BLK_D); @@ -599,7 +612,7 @@ static double enc_block_rdo_placebo(oapve_ctx_t *ctx, oapve_core_t *core, int lo oapv_mcpy(recon, coeff, sizeof(s16) * OAPV_BLK_D); ctx->fn_dquant[0](recon, core->q_mat_dec[c], 
log2_w, log2_h, core->dq_shift[c]); ctx->fn_itx[0](recon, ITX_SHIFT1, ITX_SHIFT2(bit_depth), 1 << log2_w); - int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w, bit_depth); + int cost = (int)ctx->fn_ssd[0](blk_w, blk_h, org, recon, blk_w, blk_w); if(cost < best_cost) { best_cost = cost; @@ -619,9 +632,8 @@ static double enc_block_rdo_placebo(oapve_ctx_t *ctx, oapve_core_t *core, int lo } } - int curr_dc = best_coeff[0]; - best_coeff[0] -= core->prev_dc[c]; - core->prev_dc[c] = curr_dc; + core->dc_diff = best_coeff[0] - core->prev_dc[c]; + core->prev_dc[c] = best_coeff[0]; return best_cost; } @@ -633,33 +645,33 @@ static int enc_read_param(oapve_ctx_t *ctx, oapve_param_t *param) oapv_assert_rv(param->qp >= MIN_QUANT && param->qp <= MAX_QUANT, OAPV_ERR_INVALID_ARGUMENT); ctx->qp[Y_C] = param->qp; - ctx->qp[U_C] = param->qp + param->qp_cb_offset; - ctx->qp[V_C] = param->qp + param->qp_cr_offset; + ctx->qp[U_C] = oapv_clip3(MIN_QUANT, MAX_QUANT, param->qp + param->qp_cb_offset); + ctx->qp[V_C] = oapv_clip3(MIN_QUANT, MAX_QUANT, param->qp + param->qp_cr_offset); ctx->qp[X_C] = param->qp; ctx->num_comp = get_num_comp(param->csp); if(param->preset == OAPV_PRESET_SLOW) { - ctx->fn_block = enc_block_rdo_slow; + ctx->fn_enc_blk = enc_block_rdo_slow; } else if(param->preset == OAPV_PRESET_PLACEBO) { - ctx->fn_block = enc_block_rdo_placebo; + ctx->fn_enc_blk = enc_block_rdo_placebo; } else if(param->preset == OAPV_PRESET_MEDIUM) { - ctx->fn_block = enc_block_rdo_medium; + ctx->fn_enc_blk = enc_block_rdo_medium; } else { - ctx->fn_block = enc_block; + ctx->fn_enc_blk = enc_block; } ctx->log2_block = OAPV_LOG2_BLK; /* set various value */ - ctx->w = ((ctx->param->w + (OAPV_MB_W - 1)) >> OAPV_LOG2_MB_W) << OAPV_LOG2_MB_W; - ctx->h = ((ctx->param->h + (OAPV_MB_H - 1)) >> OAPV_LOG2_MB_H) << OAPV_LOG2_MB_H; + ctx->w = ((param->w + (OAPV_MB_W - 1)) >> OAPV_LOG2_MB_W) << OAPV_LOG2_MB_W; + ctx->h = ((param->h + (OAPV_MB_H - 1)) >> OAPV_LOG2_MB_H) << OAPV_LOG2_MB_H; - 
int tile_w = ctx->param->tile_w_mb * OAPV_MB_W; - int tile_h = ctx->param->tile_h_mb * OAPV_MB_H; + int tile_w = param->tile_w_mb * OAPV_MB_W; + int tile_h = param->tile_h_mb * OAPV_MB_H; enc_set_tile_info(ctx->tile, ctx->w, ctx->h, tile_w, tile_h, &ctx->num_tile_cols, &ctx->num_tile_rows, &ctx->num_tiles); return OAPV_OK; @@ -762,16 +774,16 @@ static int enc_tile_comp(oapv_bs_t *bs, oapve_tile_t *tile, oapve_ctx_t *ctx, oa for(blk_y = mb_y; blk_y < (mb_y + mb_h); blk_y += OAPV_BLK_H) { for(blk_x = mb_x; blk_x < (mb_x + mb_w); blk_x += OAPV_BLK_W) { o16 = (s16 *)((u8 *)org + blk_y * s_org) + blk_x; - ctx->fn_imgb_to_block[c](o16, OAPV_BLK_W, OAPV_BLK_H, s_org, blk_x, (OAPV_BLK_W << 1), core->coef); + ctx->fn_imgb_to_blk[c](o16, OAPV_BLK_W, OAPV_BLK_H, s_org, blk_x, (OAPV_BLK_W << 1), core->coef); - ctx->fn_block(ctx, core, OAPV_LOG2_BLK_W, OAPV_LOG2_BLK_H, c); - oapve_vlc_dc_coeff(ctx, core, bs, core->coef[0], c); + ctx->fn_enc_blk(ctx, core, OAPV_LOG2_BLK_W, OAPV_LOG2_BLK_H, c); + oapve_vlc_dc_coeff(ctx, core, bs, core->dc_diff, c); oapve_vlc_ac_coeff(ctx, core, bs, core->coef, 0, c); DUMP_COEF(core->coef, OAPV_BLK_D, blk_x, blk_y, c); if(rec != NULL) { r16 = (s16 *)((u8 *)rec + blk_y * s_rec) + blk_x; - ctx->fn_block_to_imgb[c](core->coef_rec, OAPV_BLK_W, OAPV_BLK_H, (OAPV_BLK_W << 1), blk_x, s_rec, r16); + ctx->fn_blk_to_imgb[c](core->coef_rec, OAPV_BLK_W, OAPV_BLK_H, (OAPV_BLK_W << 1), blk_x, s_rec, r16); } } } @@ -820,7 +832,7 @@ static int enc_tile(oapve_ctx_t *ctx, oapve_core_t *core, oapve_tile_t *tile) } } - if(ctx->rec || ctx->param->preset > OAPV_PRESET_MEDIUM) { + if(ctx->rec || ctx->param->preset >= OAPV_PRESET_MEDIUM) { core->dq_shift[c] = ctx->bit_depth - 2 - (core->qp[c] / 6); int cnt = 0; @@ -1026,22 +1038,22 @@ static int enc_frm_prepare(oapve_ctx_t *ctx, oapv_imgb_t *imgb_i, oapv_imgb_t *i ctx->bit_depth = OAPV_CS_GET_BIT_DEPTH(imgb_i->cs); if(OAPV_CS_GET_FORMAT(imgb_i->cs) == OAPV_CF_PLANAR2) { - ctx->fn_imgb_to_block_rc = imgb_to_block_p210; + 
ctx->fn_imgb_to_blk_rc = imgb_to_block_p210; - ctx->fn_imgb_to_block[Y_C] = imgb_to_block_p210_y; - ctx->fn_imgb_to_block[U_C] = imgb_to_block_p210_uv; - ctx->fn_imgb_to_block[V_C] = imgb_to_block_p210_uv; + ctx->fn_imgb_to_blk[Y_C] = imgb_to_block_p210_y; + ctx->fn_imgb_to_blk[U_C] = imgb_to_block_p210_uv; + ctx->fn_imgb_to_blk[V_C] = imgb_to_block_p210_uv; - ctx->fn_block_to_imgb[Y_C] = block_to_imgb_p210_y; - ctx->fn_block_to_imgb[U_C] = block_to_imgb_p210_uv; - ctx->fn_block_to_imgb[V_C] = block_to_imgb_p210_uv; + ctx->fn_blk_to_imgb[Y_C] = block_to_imgb_p210_y; + ctx->fn_blk_to_imgb[U_C] = block_to_imgb_p210_uv; + ctx->fn_blk_to_imgb[V_C] = block_to_imgb_p210_uv; ctx->fn_img_pad = enc_img_pad_p210; } else { - ctx->fn_imgb_to_block_rc = imgb_to_block; + ctx->fn_imgb_to_blk_rc = imgb_to_block; for(int i = 0; i < ctx->num_comp; i++) { - ctx->fn_imgb_to_block[i] = imgb_to_block_10bit; - ctx->fn_block_to_imgb[i] = block_to_imgb_10bit; + ctx->fn_imgb_to_blk[i] = imgb_to_block_10bit; + ctx->fn_blk_to_imgb[i] = block_to_imgb_10bit; } ctx->fn_img_pad = enc_img_pad; } @@ -1123,13 +1135,14 @@ static int enc_frame(oapve_ctx_t *ctx) ctx->rc_param.lambda = oapve_rc_estimate_pic_lambda(ctx, cost_sum); ctx->rc_param.qp = oapve_rc_estimate_pic_qp(ctx->rc_param.lambda); + printf("QP=%d\n", ctx->rc_param.qp); for(int c = 0; c < ctx->num_comp; c++) { ctx->qp[c] = ctx->rc_param.qp; if(c == 1) { - ctx->qp[c] += ctx->param->qp_cb_offset; + ctx->qp[c] = oapv_clip3(MIN_QUANT, MAX_QUANT, ctx->qp[c] + ctx->param->qp_cb_offset); } else if(c == 2) { - ctx->qp[c] += ctx->param->qp_cr_offset; + ctx->qp[c] = oapv_clip3(MIN_QUANT, MAX_QUANT, ctx->qp[c] + ctx->param->qp_cr_offset); } } } @@ -1162,6 +1175,8 @@ static int enc_frame(oapve_ctx_t *ctx) /* rewrite frame header */ if(ctx->fh.tile_size_present_in_fh_flag) { oapve_vlc_frame_header(&bs_fh, ctx, &ctx->fh); + /* de-init BSW */ + oapv_bsw_sink(&bs_fh); } if(ctx->param->rc_type != 0) { oapve_rc_update_after_pic(ctx, cost_sum); @@ -1193,7 
+1208,9 @@ static int enc_platform_init(oapve_ctx_t *ctx) support_avx2 = (check_cpu >> 2) & 1; if(support_avx2) { + ctx->fn_sad = oapv_tbl_fn_sad_16b_avx; ctx->fn_ssd = oapv_tbl_fn_ssd_16b_avx; + ctx->fn_diff = oapv_tbl_fn_diff_16b_avx; ctx->fn_itx_part = oapv_tbl_fn_itx_part_avx; ctx->fn_itx = oapv_tbl_fn_itx_avx; ctx->fn_itx_adj = oapv_tbl_fn_itx_adj_avx; @@ -1207,7 +1224,9 @@ static int enc_platform_init(oapve_ctx_t *ctx) ctx->fn_had8x8 = oapv_dc_removed_had8x8_sse; } #elif ARM_NEON + ctx->fn_sad = oapv_tbl_fn_sad_16b_neon; ctx->fn_ssd = oapv_tbl_fn_ssd_16b_neon; + ctx->fn_diff = oapv_tbl_fn_diff_16b_neon; ctx->fn_itx = oapv_tbl_fn_itx_neon; ctx->fn_txb = oapv_tbl_fn_txb_neon; ctx->fn_quant = oapv_tbl_fn_quant_neon; @@ -1317,7 +1336,7 @@ int oapve_encode(oapve_t eid, oapv_frms_t *ifrms, oapvm_t mid, oapv_bitb_t *bitb DUMP_LOAD(1); stat->frm_size[i] = pbu_size + 4 /* PUB size length*/; - copy_fi_to_finfo(&ctx->fh.fi, frm->pbu_type, frm->group_id, &stat->aui.frm_info[i]); + copy_fh_to_finfo(&ctx->fh, frm->pbu_type, frm->group_id, &stat->aui.frm_info[i]); // add frame hash value of reconstructed frame into metadata list if(ctx->use_frm_hash) { @@ -1453,6 +1472,20 @@ int oapve_param_default(oapve_param_t *param) param->level_idc = (int)(4.1 * 30); param->band_idc = 2; + param->use_q_matrix = 0; + + param->color_description_present_flag = 0; + param->color_primaries = 2; // unspecified color primaries + param->transfer_characteristics = 2; // unspecified transfer characteristics + param->matrix_coefficients = 2; // unspecified matrix coefficients + param->full_range_flag = 0; // limited range + + for(int c = 0; c < OAPV_MAX_CC; c++) { + for(int i = 0; i < OAPV_BLK_D; i++) { + param->q_matrix[c][i] = 16; + } + } + return OAPV_OK; } @@ -1513,7 +1546,7 @@ static int dec_block(oapvd_ctx_t *ctx, oapvd_core_t *core, int log2_w, int log2_ int bit_depth = ctx->bit_depth; // DC prediction - core->coef[0] += core->prev_dc[c]; + core->coef[0] = core->dc_diff + core->prev_dc[c]; 
core->prev_dc[c] = core->coef[0]; // Inverse quantization ctx->fn_dquant[0](core->coef, core->q_mat[c], log2_w, log2_h, core->dq_shift[c]); @@ -1617,7 +1650,7 @@ static int dec_tile_comp(oapvd_tile_t *tile, oapvd_ctx_t *ctx, oapvd_core_t *cor for(blk_y = mb_y; blk_y < (mb_y + mb_h); blk_y += OAPV_BLK_H) { for(blk_x = mb_x; blk_x < (mb_x + mb_w); blk_x += OAPV_BLK_W) { // parse DC coefficient - ret = oapvd_vlc_dc_coeff(ctx, core, bs, &core->coef[0], c); + ret = oapvd_vlc_dc_coeff(ctx, core, bs, &core->dc_diff, c); oapv_assert_rv(OAPV_SUCCEEDED(ret), ret); // parse AC coefficient @@ -1945,6 +1978,9 @@ int oapvd_decode(oapvd_t did, oapv_bitb_t *bitb, oapv_frms_t *ofrms, oapvm_t mid pbuh.pbu_type == OAPV_PBU_TYPE_PREVIEW_FRAME || pbuh.pbu_type == OAPV_PBU_TYPE_DEPTH_FRAME || pbuh.pbu_type == OAPV_PBU_TYPE_ALPHA_FRAME) { + + oapv_assert_rv(frame_cnt < OAPV_MAX_NUM_FRAMES, OAPV_ERR_REACHED_MAX); + ret = oapvd_vlc_frame_header(bs, &ctx->fh); oapv_assert_g(OAPV_SUCCEEDED(ret), ERR); @@ -1975,9 +2011,9 @@ int oapvd_decode(oapvd_t did, oapv_bitb_t *bitb, oapv_frms_t *ofrms, oapvm_t mid /* READ FILLER HERE !!! 
*/ oapv_bsr_move(&ctx->bs, ctx->tile_end); - stat->read += bsr_get_read_byte(&ctx->bs); + stat->read += BSR_GET_READ_BYTE(&ctx->bs); - copy_fi_to_finfo(&ctx->fh.fi, pbuh.pbu_type, pbuh.group_id, &stat->aui.frm_info[frame_cnt]); + copy_fh_to_finfo(&ctx->fh, pbuh.pbu_type, pbuh.group_id, &stat->aui.frm_info[frame_cnt]); if(ret == OAPV_OK && ctx->use_frm_hash) { oapv_imgb_set_md5(ctx->imgb); } @@ -1993,7 +2029,7 @@ int oapvd_decode(oapvd_t did, oapv_bitb_t *bitb, oapv_frms_t *ofrms, oapvm_t mid ret = oapvd_vlc_metadata(bs, pbu_size, mid, pbuh.group_id); oapv_assert_g(OAPV_SUCCEEDED(ret), ERR); - stat->read += bsr_get_read_byte(&ctx->bs); + stat->read += BSR_GET_READ_BYTE(&ctx->bs); } else if(pbuh.pbu_type == OAPV_PBU_TYPE_FILLER) { ret = oapvd_vlc_filler(bs, (pbu_size - 4)); diff --git a/src/oapv_bs.c b/src/oapv_bs.c index f9d068d..c3c47c8 100644 --- a/src/oapv_bs.c +++ b/src/oapv_bs.c @@ -197,9 +197,8 @@ static int bsr_flush(oapv_bs_t *bs, int byte) bs->leftbits = byte << 3; - bs->cur += byte; while(byte) { - code |= *(bs->cur - byte) << shift; + code |= *(bs->cur++) << shift; byte--; shift -= 8; } @@ -237,6 +236,17 @@ int oapv_bsr_clz_in_code(u32 code) return clz; } +int oapv_bsr_clz(oapv_bs_t *bs) +{ + int clz; + u32 code; + + code = oapv_bsr_peek(bs, 32); + oapv_assert(code != 0); + clz = oapv_bsr_clz_in_code(code); + return clz; +} + void oapv_bsr_align8(oapv_bs_t *bs) { /* @@ -266,7 +276,7 @@ void oapv_bsr_skip(oapv_bs_t *bs, int size) bsr_skip_code(bs, size); } -void oapv_bsr_peek(oapv_bs_t *bs, u32 *val, int size) +u32 oapv_bsr_peek(oapv_bs_t *bs, int size) { int byte, leftbits; u32 code = 0; @@ -302,7 +312,7 @@ void oapv_bsr_peek(oapv_bs_t *bs, u32 *val, int size) code |= *(bs->cur) >> (8 - size); } } - *val = code; + return code; } void *oapv_bsr_sink(oapv_bs_t *bs) diff --git a/src/oapv_bs.h b/src/oapv_bs.h index c7fcc0f..bc2a4d3 100644 --- a/src/oapv_bs.h +++ b/src/oapv_bs.h @@ -81,28 +81,60 @@ int oapv_bsw_write(oapv_bs_t *bs, u32 val, int len); // start 
of decoder code #if ENABLE_DECODER /////////////////////////////////////////////////////////////////////////////// -/*! is bitstream byte aligned? */ -static bool inline bsr_is_align8(oapv_bs_t *bs) -{ - return ((bs->leftbits & 0x7) == 0) ? true : false; -} +#if 0 +#if defined(X86F) || defined(ARMV8N_64) +/* on X86 machine, 32-bit shift means remaining of original value, so we +should set zero in that case. */ +#define BSR_SKIP_CODE(bs, size) \ + oapv_assert((bs)->leftbits >= (size)); \ + if((size) == 32) {(bs)->code = 0; (bs)->leftbits = 0;} \ + else {(bs)->code <<= (size); (bs)->leftbits -= (size);} +#else +#define BSR_SKIP_CODE(bs, size) \ + oapv_assert((bs)->leftbits >= (size)); \ + (bs)->code <<= (size); (bs)->leftbits -= (size); +#endif +#else +#define BSR_SKIP_CODE(bs, size) \ + oapv_assert((bs)->leftbits >= (size) && (size) <= 32); \ + (bs)->code <<= (size); (bs)->leftbits -= (size); +#endif + +/*! Is end of bitstream ? */ +#define BSR_IS_EOB(bs) (((bs)->cur > (bs)->end && (bs)->leftbits==0)? 1: 0) + +/*! Is bitstream byte aligned? */ +#define BSR_IS_BYTE_ALIGN(bs) ((((bs)->leftbits & 0x7) == 0)? 1: 0) + +/*! Is last byte of bitsteam? */ +#define BSR_IS_LAST_BYTE(bs) \ + (((bs)->cur > (bs)->end && bs->leftbits > 0 && (bs)->leftbits <= 8)? 
1: 0) +/* get left byte count in BS */ +#define BSR_GET_LEFT_BYTE(bs) \ + ((int)((bs)->end - (bs)->cur) + 1 + ((bs)->leftbits >> 3)) /* get number of byte consumed */ -static int inline bsr_get_read_byte(oapv_bs_t *bs) -{ - return ((int)((bs)->cur - (bs)->beg) - ((bs)->leftbits >> 3)); -} +#define BSR_GET_READ_BYTE(bs) \ + ((int)((bs)->cur - (bs)->beg) - ((bs)->leftbits >> 3)) +/* get number of bit consumed */ +#define BSR_GET_READ_BIT(bs) \ + (((int)((bs)->cur - (bs)->beg) << 3) - ((bs)->leftbits)) -static int inline bsr_get_remained_byte(oapv_bs_t *bs) -{ - return (bs->size - bsr_get_read_byte(bs)); -} +/* get address of current reading */ +#define BSR_GET_CUR(bs) ((bs)->cur - (((bs)->leftbits + 7) >> 3)) + +/* move to # bytes align position */ +#define BSR_MOVE_BYTE_ALIGN(bs, byte) \ + (bs)->cur += (byte) - ((bs)->leftbits >> 3); \ + (bs)->code = 0; \ + (bs)->leftbits = 0; void oapv_bsr_init(oapv_bs_t *bs, u8 *buf, int size, oapv_bs_fn_flush_t fn_flush); int oapv_bsr_clz_in_code(u32 code); +int oapv_bsr_clz(oapv_bs_t *bs); void oapv_bsr_align8(oapv_bs_t *bs); void oapv_bsr_skip(oapv_bs_t *bs, int size); -void oapv_bsr_peek(oapv_bs_t *bs, u32 *val, int size); +u32 oapv_bsr_peek(oapv_bs_t *bs, int size); void *oapv_bsr_sink(oapv_bs_t *bs); void oapv_bsr_move(oapv_bs_t *bs, u8 *pos); u32 oapv_bsr_read(oapv_bs_t *bs, int size); diff --git a/src/oapv_def.h b/src/oapv_def.h index f6b1429..a70dd63 100644 --- a/src/oapv_def.h +++ b/src/oapv_def.h @@ -105,8 +105,8 @@ struct oapv_fi { // 112byte int level_idc; /* u( 8) */ int band_idc; /* u( 3) */ // int reserved_zero_5bits; /* u( 5) */ - u32 frame_width; /* u(32) */ - u32 frame_height; /* u(32) */ + u32 frame_width; /* u(24) */ + u32 frame_height; /* u(24) */ int chroma_format_idc; /* u( 4) */ int bit_depth; /* u( 4) */ int capture_time_distance; /* u( 8) */ @@ -124,13 +124,14 @@ struct oapv_fh { int color_primaries; /* u( 8) */ int transfer_characteristics; /* u( 8) */ int matrix_coefficients; /* u( 8) */ + int 
full_range_flag; /* u( 1) */ int use_q_matrix; /* u( 1) */ /* (start) quantization_matix */ - int q_matrix[N_C][OAPV_BLK_H][OAPV_BLK_W]; /* u( 8) minus 1*/ + int q_matrix[N_C][OAPV_BLK_H][OAPV_BLK_W]; /* u( 8) */ /* ( end ) quantization_matix */ /* (start) tile_info */ - int tile_width_in_mbs; /* u(28) minus 1*/ - int tile_height_in_mbs; /* u(28) minus 1*/ + int tile_width_in_mbs; /* u(20) */ + int tile_height_in_mbs; /* u(20) */ int tile_size_present_in_fh_flag; /* u( 1) */ u32 tile_size[OAPV_MAX_TILES]; /* u(32) */ /* ( end ) tile_info */ @@ -182,16 +183,16 @@ typedef void (*oapv_fn_tx_t)(s16 *coef, s16 *t, int shift, int line); typedef void (*oapv_fn_itx_adj_t)(int *src, int *dst, int itrans_diff_idx, int diff_step, int shift); typedef int (*oapv_fn_quant_t)(s16 *coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset); typedef void (*oapv_fn_dquant_t)(s16 *coef, s16 q_matrix[OAPV_BLK_D], int log2_w, int log2_h, s8 shift); -typedef int (*oapv_fn_sad_t)(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth); -typedef s64 (*oapv_fn_ssd_t)(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth); -typedef void (*oapv_fn_diff_t)(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff, int bit_depth); +typedef int (*oapv_fn_sad_t)(int w, int h, void *src1, void *src2, int s_src1, int s_src2); +typedef s64 (*oapv_fn_ssd_t)(int w, int h, void *src1, void *src2, int s_src1, int s_src2); +typedef void (*oapv_fn_diff_t)(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff); -typedef double (*oapv_fn_block_cost_t)(oapve_ctx_t *ctx, oapve_core_t *core, int log2_w, int log2_h, int c); -typedef void (*oapv_fn_imgb_to_block_rc)(oapv_imgb_t *imgb, int c, int x_l, int y_l, int w_l, int h_l, s16 *block); -typedef void (*oapv_fn_imgb_to_block)(void *src, int blk_w, int blk_h, int s_src, int offset_src, int s_dst, void *dst); -typedef void 
(*oapv_fn_block_to_imgb)(void *src, int blk_w, int blk_h, int s_src, int offset_dst, int s_dst, void *dst); -typedef void (*oapv_fn_img_pad)(oapve_ctx_t *ctx, oapv_imgb_t *imgb); -typedef int (*oapv_fn_had8x8)(pel *org, int s_org); +typedef double (*oapv_fn_enc_blk_cost_t)(oapve_ctx_t *ctx, oapve_core_t *core, int log2_w, int log2_h, int c); +typedef void (*oapv_fn_imgb_to_blk_rc_t)(oapv_imgb_t *imgb, int c, int x_l, int y_l, int w_l, int h_l, s16 *block); +typedef void (*oapv_fn_imgb_to_blk_t)(void *src, int blk_w, int blk_h, int s_src, int offset_src, int s_dst, void *dst); +typedef void (*oapv_fn_blk_to_imgb_t)(void *src, int blk_w, int blk_h, int s_src, int offset_dst, int s_dst, void *dst); +typedef void (*oapv_fn_img_pad_t)(oapve_ctx_t *ctx, oapv_imgb_t *imgb); +typedef int (*oapv_fn_had8x8_t)(pel *org, int s_org); /***************************************************************************** * rate-control related @@ -230,7 +231,8 @@ struct oapve_core { int prev_1st_ac_ctx[N_C]; int tile_idx; int prev_dc[N_C]; - + int dc_diff; /* DC difference, which is represented in 17 bits */ + /* and coded as abs_dc_coeff_diff and sign_dc_coeff_diff */ int qp[N_C]; // QPs for Y, Cb(U), Cr(V) int dq_shift[N_C]; @@ -300,15 +302,14 @@ struct oapve_ctx { const oapv_fn_sad_t *fn_sad; const oapv_fn_ssd_t *fn_ssd; const oapv_fn_diff_t *fn_diff; - oapv_fn_imgb_to_block_rc fn_imgb_to_block_rc; - oapv_fn_imgb_to_block fn_imgb_to_block[N_C]; - oapv_fn_block_to_imgb fn_block_to_imgb[N_C]; - oapv_fn_img_pad fn_img_pad; - oapv_fn_block_cost_t fn_block; - oapv_fn_had8x8 fn_had8x8; - int use_frm_hash; - void *tx_tbl; + oapv_fn_imgb_to_blk_rc_t fn_imgb_to_blk_rc; + oapv_fn_imgb_to_blk_t fn_imgb_to_blk[N_C]; + oapv_fn_blk_to_imgb_t fn_blk_to_imgb[N_C]; + oapv_fn_img_pad_t fn_img_pad; + oapv_fn_enc_blk_cost_t fn_enc_blk; + oapv_fn_had8x8_t fn_had8x8; + int use_frm_hash; oapve_rc_param_t rc_param; /* platform specific data, if needed */ @@ -336,7 +337,7 @@ struct oapvd_tile { int y; /* y 
(row) position in a frame in unit of pixel */ int w; /* tile width in unit of pixel */ int h; /* tile height in unit of pixel */ - u32 data_size; /* tile size including tile_size_minus1 syntax */ + u32 data_size; /* tile size including tile_size syntax */ u8 *bs_beg; /* start position of tile in input bistream */ u8 *bs_end; /* end position of tile() in input bistream */ @@ -353,6 +354,8 @@ struct oapvd_core { int prev_dc_ctx[N_C]; int prev_1st_ac_ctx[N_C]; int prev_dc[N_C]; + int dc_diff; /* DC difference, which is represented in 17 bits */ + /* and coded as abs_dc_coeff_diff and sign_dc_coeff_diff */ int qp[N_C]; int dq_shift[N_C]; s16 q_mat[N_C][OAPV_BLK_D]; @@ -372,7 +375,7 @@ struct oapvd_ctx { oapv_imgb_t *imgb; const oapv_fn_itx_t *fn_itx; const oapv_fn_dquant_t *fn_dquant; - oapv_fn_block_to_imgb fn_block_to_imgb[N_C]; + oapv_fn_blk_to_imgb_t fn_block_to_imgb[N_C]; oapv_bs_t bs; oapv_fh_t fh; diff --git a/src/oapv_metadata.c b/src/oapv_metadata.c index aea2023..d56fd90 100644 --- a/src/oapv_metadata.c +++ b/src/oapv_metadata.c @@ -40,6 +40,22 @@ static oapvm_ctx_t *meta_id_to_ctx(oapvm_t id) oapv_assert_rv(ctx->magic == OAPVM_MAGIC_CODE, NULL); return ctx; } +#define div_255_fast(x) (((x) + (((x) + 257) >> 8)) >> 8) + +static inline u32 meta_get_byte_pld_type(oapv_mdp_t *mdp) +{ + return (mdp->pld_type < 65536 ? div_255_fast(mdp->pld_type) : mdp->pld_type / 255) + 1; +} + +static inline u32 meta_get_byte_pld_size(oapv_mdp_t *mdp) +{ + return (mdp->pld_size < 65536 ? 
div_255_fast(mdp->pld_size) : mdp->pld_size / 255) + 1; +} + +static inline u32 meta_get_byte_pld_all(oapv_mdp_t *mdp) +{ + return meta_get_byte_pld_type(mdp) + meta_get_byte_pld_size(mdp) + mdp->pld_size; +} static oapv_mdp_t **meta_mdp_find_last_with_check(oapv_md_t *md, int type, unsigned char *uuid) { @@ -101,6 +117,7 @@ static int meta_md_rm_mdp(oapv_md_t *md, int mdt) mdp_prev->next = mdp->next; } meta_md_free_mdp(mdp); + md->md_size -= meta_get_byte_pld_all(mdp); md->md_num--; return OAPV_OK; } @@ -122,8 +139,8 @@ static int meta_md_rm_usd(oapv_md_t *md, unsigned char *uuid) mdp_prev->next = mdp->next; } oapv_assert_rv(md->md_size >= mdp->pld_size, OAPV_ERR_UNEXPECTED); - md->md_size -= mdp->pld_size; meta_md_free_mdp(mdp); + md->md_size -= meta_get_byte_pld_all(mdp); md->md_num--; return OAPV_OK; } @@ -200,7 +217,7 @@ static void meta_free_md(oapv_md_t *md) int oapvm_set(oapvm_t mid, int group_id, int type, void *data, int size, unsigned char *uuid) { oapvm_ctx_t *md_list = meta_id_to_ctx(mid); - + oapv_assert_rv(md_list, OAPV_ERR_INVALID_ARGUMENT); int ret = meta_verify_mdp_data(type, size, (u8 *)data); oapv_assert_rv(OAPV_SUCCEEDED(ret), ret); @@ -230,24 +247,7 @@ int oapvm_set(oapvm_t mid, int group_id, int type, void *data, int size, unsigne tmp_mdp->pld_type = type; tmp_mdp->pld_data = data; *last_ptr = tmp_mdp; - - /* calculate length of payload type */ - int tmp_mpt = type; - while(tmp_mpt >= 255) { - tmp_mpt -= 255; - md_list->md_arr[md_list_idx].md_size += 1; - } - md_list->md_arr[md_list_idx].md_size += 1; - - /* calculate length of payload data size */ - int tmp_mps = size; - while(tmp_mps >= 255) { - tmp_mps -= 255; - md_list->md_arr[md_list_idx].md_size += 1; - } - md_list->md_arr[md_list_idx].md_size += 1; - - md_list->md_arr[md_list_idx].md_size += tmp_mdp->pld_size; + md_list->md_arr[md_list_idx].md_size += meta_get_byte_pld_all(tmp_mdp); md_list->md_arr[md_list_idx].md_num++; return OAPV_OK; } @@ -320,7 +320,7 @@ int oapvm_set_all(oapvm_t 
mid, oapvm_payload_t *pld, int num_plds) tmp_mdp->pld_size = pld[i].data_size; tmp_mdp->pld_type = pld[i].type; tmp_mdp->pld_data = pld[i].data; - md_list->md_arr[md_list_idx].md_size += tmp_mdp->pld_size; + md_list->md_arr[md_list_idx].md_size += meta_get_byte_pld_all(tmp_mdp); *last_ptr = tmp_mdp; } diff --git a/src/oapv_rc.c b/src/oapv_rc.c index afe6385..782cc7e 100644 --- a/src/oapv_rc.c +++ b/src/oapv_rc.c @@ -46,7 +46,7 @@ int oapve_rc_get_tile_cost(oapve_ctx_t* ctx, oapve_core_t* core, oapve_tile_t* t int tx = tile->x + x; int ty = tile->y + y; - ctx->fn_imgb_to_block_rc(ctx->imgb, c, tx, ty, 8, 8, core->coef); + ctx->fn_imgb_to_blk_rc(ctx->imgb, c, tx, ty, 8, 8, core->coef); sum += ctx->fn_had8x8(core->coef, 8); tile->rc.number_pixel += 64; } @@ -184,7 +184,7 @@ void oapve_rc_get_qp(oapve_ctx_t* ctx, oapve_tile_t* tile, int frame_qp, int* qp *qp = (int)(4.2005 * log(est_lambda) + 13.7122 + 0.5); *qp = oapv_clip3(min_qp, max_qp, *qp); *qp += OAPV_RC_QP_OFFSET; - + *qp = oapv_clip3(MIN_QUANT, MAX_QUANT, *qp); } void oapve_rc_update_after_pic(oapve_ctx_t* ctx, double cost) diff --git a/src/oapv_sad.c b/src/oapv_sad.c index a8ae8a0..de9992e 100644 --- a/src/oapv_sad.c +++ b/src/oapv_sad.c @@ -33,13 +33,13 @@ #include /* SAD for 16bit **************************************************************/ -int oapv_sad_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth) +int oapv_sad_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2) { - u16 *s1; + s16 *s1; s16 *s2; int i, j, sad; - s1 = (u16 *)src1; + s1 = (s16 *)src1; s2 = (s16 *)src2; sad = 0; @@ -52,7 +52,7 @@ int oapv_sad_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2, i s2 += s_src2; } - return (sad >> (bit_depth - 8)); + return sad; } const oapv_fn_sad_t oapv_tbl_fn_sad_16b[2] = { @@ -61,7 +61,7 @@ const oapv_fn_sad_t oapv_tbl_fn_sad_16b[2] = { }; /* DIFF **********************************************************************/ -void oapv_diff_16b(int 
w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff, int bit_depth) +void oapv_diff_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff) { s16 *s1; s16 *s2; @@ -86,7 +86,7 @@ const oapv_fn_diff_t oapv_tbl_fn_diff_16b[2] = { }; /* SSD ***********************************************************************/ -s64 oapv_ssd_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth) +s64 oapv_ssd_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2) { s16 *s1; s16 *s2; diff --git a/src/oapv_sad.h b/src/oapv_sad.h index 7f67707..dbdd309 100644 --- a/src/oapv_sad.h +++ b/src/oapv_sad.h @@ -34,9 +34,9 @@ #include "oapv_port.h" -int oapv_sad_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth); -void oapv_diff_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff, int bit_depth); -s64 oapv_ssd_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth); +int oapv_sad_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2); +void oapv_diff_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff); +s64 oapv_ssd_16b(int w, int h, void *src1, void *src2, int s_src1, int s_src2); int oapv_dc_removed_had8x8(pel *org, int s_org); extern const oapv_fn_sad_t oapv_tbl_fn_sad_16b[2]; diff --git a/src/oapv_vlc.c b/src/oapv_vlc.c index 61ed32a..cfef359 100644 --- a/src/oapv_vlc.c +++ b/src/oapv_vlc.c @@ -205,6 +205,7 @@ static int dec_vlc_read_1bit_read(oapv_bs_t *bs, int k) } return symbol; } + static int dec_vlc_read(oapv_bs_t *bs, int k) { u32 symbol = 0; @@ -268,22 +269,26 @@ static int dec_vlc_read(oapv_bs_t *bs, int k) void oapve_set_frame_header(oapve_ctx_t *ctx, oapv_fh_t *fh) { + oapve_param_t * param = ctx->param; + oapv_mset(fh, 0, sizeof(oapv_fh_t)); - fh->fi.profile_idc = ctx->param->profile_idc; - fh->fi.level_idc = ctx->param->level_idc; - 
fh->fi.band_idc = ctx->param->band_idc; - fh->fi.frame_width = ctx->param->w; - fh->fi.frame_height = ctx->param->h; + fh->fi.profile_idc = param->profile_idc; + fh->fi.level_idc = param->level_idc; + fh->fi.band_idc = param->band_idc; + fh->fi.frame_width = param->w; + fh->fi.frame_height = param->h; fh->fi.chroma_format_idc = ctx->cfi; fh->fi.bit_depth = ctx->bit_depth; - fh->tile_width_in_mbs = ctx->param->tile_w_mb; - fh->tile_height_in_mbs = ctx->param->tile_h_mb; - if(fh->color_description_present_flag == 0) { - fh->color_primaries = 2; - fh->transfer_characteristics = 2; - fh->matrix_coefficients = 2; - } - fh->use_q_matrix = ctx->param->use_q_matrix; + fh->tile_width_in_mbs = param->tile_w_mb; + fh->tile_height_in_mbs = param->tile_h_mb; + + fh->color_description_present_flag = param->color_description_present_flag; + fh->color_primaries = param->color_primaries; + fh->transfer_characteristics = param->transfer_characteristics; + fh->matrix_coefficients = param->matrix_coefficients; + fh->full_range_flag = param->full_range_flag; + + fh->use_q_matrix = param->use_q_matrix; if(fh->use_q_matrix == 0) { for(int cidx = 0; cidx < ctx->num_comp; cidx++) { for(int y = 0; y < OAPV_BLK_H; y++) { @@ -295,11 +300,10 @@ void oapve_set_frame_header(oapve_ctx_t *ctx, oapv_fh_t *fh) } else { int mod = (1 << OAPV_LOG2_BLK) - 1; - for(int i = 0; i < OAPV_BLK_D; i++) { - fh->q_matrix[Y_C][i >> OAPV_LOG2_BLK][i & mod] = ctx->param->q_matrix_y[i]; - fh->q_matrix[U_C][i >> OAPV_LOG2_BLK][i & mod] = ctx->param->q_matrix_u[i]; - fh->q_matrix[V_C][i >> OAPV_LOG2_BLK][i & mod] = ctx->param->q_matrix_v[i]; - fh->q_matrix[X_C][i >> OAPV_LOG2_BLK][i & mod] = ctx->param->q_matrix_x[i]; + for(int c= 0; c q_matrix[c][i >> OAPV_LOG2_BLK][i & mod] = param->q_matrix[c][i]; + } } } fh->tile_size_present_in_fh_flag = 0; @@ -310,7 +314,7 @@ static int enc_vlc_quantization_matrix(oapv_bs_t *bs, oapve_ctx_t *ctx, oapv_fh_ for(int cidx = 0; cidx < ctx->num_comp; cidx++) { for(int y = 0; y < 8; 
y++) { for(int x = 0; x < 8; x++) { - oapv_bsw_write(bs, fh->q_matrix[cidx][y][x] - 1, 8); + oapv_bsw_write(bs, fh->q_matrix[cidx][y][x], 8); DUMP_HLS(fh->q_matrix, fh->q_matrix[cidx][y][x]); } } @@ -320,16 +324,16 @@ static int enc_vlc_quantization_matrix(oapv_bs_t *bs, oapve_ctx_t *ctx, oapv_fh_ static int enc_vlc_tile_info(oapv_bs_t *bs, oapve_ctx_t *ctx, oapv_fh_t *fh) { - oapv_bsw_write(bs, fh->tile_width_in_mbs - 1, 28); + oapv_bsw_write(bs, fh->tile_width_in_mbs, 20); DUMP_HLS(fh->tile_width_in_mbs, fh->tile_width_in_mbs); - oapv_bsw_write(bs, fh->tile_height_in_mbs - 1, 28); + oapv_bsw_write(bs, fh->tile_height_in_mbs, 20); DUMP_HLS(fh->tile_height_in_mbs, fh->tile_height_in_mbs); oapv_bsw_write(bs, fh->tile_size_present_in_fh_flag, 1); DUMP_HLS(fh->tile_size_present_in_fh_flag, fh->tile_size_present_in_fh_flag); if(fh->tile_size_present_in_fh_flag) { for(int i = 0; i < ctx->num_tiles; i++) { - oapv_bsw_write(bs, fh->tile_size[i] - 1, 32); - DUMP_HLS(fh->tile_size, fh->tile_size[i] - 1); + oapv_bsw_write(bs, fh->tile_size[i], 32); + DUMP_HLS(fh->tile_size, fh->tile_size[i]); } } @@ -346,10 +350,10 @@ int oapve_vlc_frame_info(oapv_bs_t *bs, oapv_fi_t *fi) DUMP_HLS(fi->band_idc, fi->band_idc); oapv_bsw_write(bs, 0, 5); // reserved_zero_5bits DUMP_HLS(reserved_zero, 0); - oapv_bsw_write(bs, fi->frame_width - 1, 32); - DUMP_HLS(fi->frame_width, fi->frame_width - 1); - oapv_bsw_write(bs, fi->frame_height - 1, 32); - DUMP_HLS(fi->frame_height, fi->frame_height - 1); + oapv_bsw_write(bs, fi->frame_width, 24); + DUMP_HLS(fi->frame_width, fi->frame_width); + oapv_bsw_write(bs, fi->frame_height, 24); + DUMP_HLS(fi->frame_height, fi->frame_height); oapv_bsw_write(bs, fi->chroma_format_idc, 4); DUMP_HLS(fi->chroma_format_idc, fi->chroma_format_idc); oapv_bsw_write(bs, fi->bit_depth - 8, 4); @@ -368,7 +372,7 @@ int oapve_vlc_frame_header(oapv_bs_t *bs, oapve_ctx_t *ctx, oapv_fh_t *fh) oapve_vlc_frame_info(bs, &fh->fi); oapv_bsw_write(bs, 0, 8); // reserved_zero_8bits 
DUMP_HLS(reserved_zero, 0); - oapv_bsw_write(bs, fh->color_description_present_flag, 1); + oapv_bsw_write1(bs, fh->color_description_present_flag); DUMP_HLS(fh->color_description_present_flag, fh->color_description_present_flag); if(fh->color_description_present_flag) { oapv_bsw_write(bs, fh->color_primaries, 8); @@ -377,8 +381,10 @@ int oapve_vlc_frame_header(oapv_bs_t *bs, oapve_ctx_t *ctx, oapv_fh_t *fh) DUMP_HLS(fh->transfer_characteristics, fh->transfer_characteristics); oapv_bsw_write(bs, fh->matrix_coefficients, 8); DUMP_HLS(fh->matrix_coefficients, fh->matrix_coefficients); + oapv_bsw_write1(bs, fh->full_range_flag); + DUMP_HLS(fh->full_range_flag, fh->full_range_flag); } - oapv_bsw_write(bs, fh->use_q_matrix, 1); + oapv_bsw_write1(bs, fh->use_q_matrix); DUMP_HLS(fh->use_q_matrix, fh->use_q_matrix); if(fh->use_q_matrix) { enc_vlc_quantization_matrix(bs, ctx, fh); @@ -393,8 +399,8 @@ int oapve_vlc_frame_header(oapv_bs_t *bs, oapve_ctx_t *ctx, oapv_fh_t *fh) int oapve_vlc_tile_size(oapv_bs_t *bs, int tile_size) { oapv_assert_rv(bsw_is_align8(bs), OAPV_ERR_MALFORMED_BITSTREAM); - oapv_bsw_write(bs, tile_size - 1, 32); - DUMP_HLS(tile_size, tile_size - 1); + oapv_bsw_write(bs, tile_size, 32); + DUMP_HLS(tile_size, tile_size); return OAPV_OK; } @@ -404,10 +410,10 @@ void oapve_set_tile_header(oapve_ctx_t *ctx, oapv_th_t *th, int tile_idx, int qp for(int c = 0; c < ctx->num_comp; c++) { th->tile_qp[c] = qp; if(c == 1) { - th->tile_qp[c] += ctx->param->qp_cb_offset; + th->tile_qp[c] = oapv_clip3(MIN_QUANT, MAX_QUANT, th->tile_qp[c] + ctx->param->qp_cb_offset); } else if(c == 2) { - th->tile_qp[c] += ctx->param->qp_cr_offset; + th->tile_qp[c] = oapv_clip3(MIN_QUANT, MAX_QUANT, th->tile_qp[c] + ctx->param->qp_cr_offset); } } th->tile_index = tile_idx; @@ -429,8 +435,8 @@ int oapve_vlc_tile_header(oapve_ctx_t *ctx, oapv_bs_t *bs, oapv_th_t *th) oapv_bsw_write(bs, th->tile_index, 16); DUMP_HLS(th->tile_index, th->tile_index); for(int c = 0; c < ctx->num_comp; c++) { - 
oapv_bsw_write(bs, th->tile_data_size[c] - 1, 32); - DUMP_HLS(th->tile_data_size, th->tile_data_size[c] - 1); + oapv_bsw_write(bs, th->tile_data_size[c], 32); + DUMP_HLS(th->tile_data_size, th->tile_data_size[c]); } for(int c = 0; c < ctx->num_comp; c++) { oapv_bsw_write(bs, th->tile_qp[c], 8); @@ -673,17 +679,13 @@ int oapvd_vlc_frame_info(oapv_bs_t *bs, oapv_fi_t *fi) DUMP_HLS(reserved_zero, reserved_zero); oapv_assert_rv(reserved_zero == 0, OAPV_ERR_MALFORMED_BITSTREAM); - fi->frame_width = oapv_bsr_read(bs, 32); + fi->frame_width = oapv_bsr_read(bs, 24); DUMP_HLS(fi->frame_width, fi->frame_width); - oapv_assert_rv(fi->frame_width > 0 && fi->frame_width < 0xFFFFFFFF, OAPV_ERR_MALFORMED_BITSTREAM); - fi->frame_width += 1; - oapv_assert_rv(fi->frame_width <= INT_MAX, OAPV_ERR_UNSUPPORTED); // frame width greater than 2^31 is unsupported in the current implementation + oapv_assert_rv(fi->frame_width > 0, OAPV_ERR_MALFORMED_BITSTREAM); - fi->frame_height = oapv_bsr_read(bs, 32); + fi->frame_height = oapv_bsr_read(bs, 24); DUMP_HLS(fi->frame_height, fi->frame_height); - oapv_assert_rv(fi->frame_height > 0 && fi->frame_height < 0xFFFFFFFF, OAPV_ERR_MALFORMED_BITSTREAM); - fi->frame_height += 1; - oapv_assert_rv(fi->frame_height <= INT_MAX, OAPV_ERR_UNSUPPORTED); // frame height greater than 2^31 is unsupported in the current implementation + oapv_assert_rv(fi->frame_height > 0, OAPV_ERR_MALFORMED_BITSTREAM); fi->chroma_format_idc = oapv_bsr_read(bs, 4); DUMP_HLS(fi->chroma_format_idc, fi->chroma_format_idc); @@ -744,8 +746,9 @@ static int dec_vlc_q_matrix(oapv_bs_t *bs, oapv_fh_t *fh) for(int cidx = 0; cidx < num_comp; cidx++) { for(int y = 0; y < OAPV_BLK_H; y++) { for(int x = 0; x < OAPV_BLK_W; x++) { - fh->q_matrix[cidx][y][x] = oapv_bsr_read(bs, 8) + 1; + fh->q_matrix[cidx][y][x] = oapv_bsr_read(bs, 8); DUMP_HLS(fh->q_matrix, fh->q_matrix[cidx][y][x]); + oapv_assert_rv(fh->q_matrix[cidx][y][x] > 0, OAPV_ERR_MALFORMED_BITSTREAM); } } } @@ -756,11 +759,13 @@ static 
int dec_vlc_tile_info(oapv_bs_t *bs, oapv_fh_t *fh) { int pic_w, pic_h, tile_w, tile_h, tile_cols, tile_rows; - fh->tile_width_in_mbs = oapv_bsr_read(bs, 28) + 1; + fh->tile_width_in_mbs = oapv_bsr_read(bs, 20); DUMP_HLS(fh->tile_width_in_mbs, fh->tile_width_in_mbs); + oapv_assert_rv(fh->tile_width_in_mbs > 0, OAPV_ERR_MALFORMED_BITSTREAM); - fh->tile_height_in_mbs = oapv_bsr_read(bs, 28) + 1; + fh->tile_height_in_mbs = oapv_bsr_read(bs, 20); DUMP_HLS(fh->tile_height_in_mbs, fh->tile_height_in_mbs); + oapv_assert_rv(fh->tile_height_in_mbs > 0, OAPV_ERR_MALFORMED_BITSTREAM); /* set various value */ pic_w = ((fh->fi.frame_width + (OAPV_MB_W - 1)) >> OAPV_LOG2_MB_W) << OAPV_LOG2_MB_W; @@ -781,8 +786,7 @@ static int dec_vlc_tile_info(oapv_bs_t *bs, oapv_fh_t *fh) for(int i = 0; i < tile_cols * tile_rows; i++) { fh->tile_size[i] = oapv_bsr_read(bs, 32); DUMP_HLS(fh->tile_size, fh->tile_size[i]); - oapv_assert_rv(fh->tile_size[i] > 0 && fh->tile_size[i] < 0xFFFFFFFF, OAPV_ERR_MALFORMED_BITSTREAM); - fh->tile_size[i] += 1; + oapv_assert_rv(fh->tile_size[i] > 0, OAPV_ERR_MALFORMED_BITSTREAM); } } return OAPV_OK; @@ -798,7 +802,7 @@ int oapvd_vlc_frame_header(oapv_bs_t *bs, oapv_fh_t *fh) DUMP_HLS(reserved_zero, reserved_zero); oapv_assert_rv(reserved_zero == 0, OAPV_ERR_MALFORMED_BITSTREAM); - fh->color_description_present_flag = oapv_bsr_read(bs, 1); + fh->color_description_present_flag = oapv_bsr_read1(bs); DUMP_HLS(fh->color_description_present_flag, fh->color_description_present_flag); if(fh->color_description_present_flag) { fh->color_primaries = oapv_bsr_read(bs, 8); @@ -807,13 +811,17 @@ int oapvd_vlc_frame_header(oapv_bs_t *bs, oapv_fh_t *fh) DUMP_HLS(fh->transfer_characteristics, fh->transfer_characteristics); fh->matrix_coefficients = oapv_bsr_read(bs, 8); DUMP_HLS(fh->matrix_coefficients, fh->matrix_coefficients); + fh->full_range_flag = oapv_bsr_read1(bs); + DUMP_HLS(fh->full_range_flag, fh->full_range_flag); } else { - fh->color_primaries = 2; - 
fh->transfer_characteristics = 2; - fh->matrix_coefficients = 2; + // default value settings + fh->color_primaries = 2; // unspecified + fh->transfer_characteristics = 2; // unspecified + fh->matrix_coefficients = 2; // unspecified + fh->full_range_flag = 0; // limited range } - fh->use_q_matrix = oapv_bsr_read(bs, 1); + fh->use_q_matrix = oapv_bsr_read1(bs); DUMP_HLS(fh->use_q_matrix, fh->use_q_matrix); if(fh->use_q_matrix) { ret = dec_vlc_q_matrix(bs, fh); @@ -848,8 +856,8 @@ int oapvd_vlc_tile_size(oapv_bs_t *bs, u32 *tile_size) { u32 size = oapv_bsr_read(bs, 32); DUMP_HLS(tile_size, size); - oapv_assert_rv(size > 0 && size < 0xFFFFFFFF, OAPV_ERR_MALFORMED_BITSTREAM); - *tile_size = size + 1; + oapv_assert_rv(size > 0, OAPV_ERR_MALFORMED_BITSTREAM); + *tile_size = size; return OAPV_OK; } @@ -862,8 +870,7 @@ int oapvd_vlc_tile_header(oapv_bs_t *bs, oapvd_ctx_t *ctx, oapv_th_t *th) for(int c = 0; c < ctx->num_comp; c++) { th->tile_data_size[c] = oapv_bsr_read(bs, 32); DUMP_HLS(th->tile_data_size, th->tile_data_size[c]); - oapv_assert_rv(th->tile_data_size[c] > 0 && th->tile_data_size[c] < 0xFFFFFFFF, OAPV_ERR_MALFORMED_BITSTREAM); - th->tile_data_size[c] += 1; + oapv_assert_rv(th->tile_data_size[c] > 0, OAPV_ERR_MALFORMED_BITSTREAM); } for(int c = 0; c < ctx->num_comp; c++) { th->tile_qp[c] = oapv_bsr_read(bs, 8); @@ -1183,7 +1190,7 @@ void oapve_vlc_ac_coeff(oapve_ctx_t *ctx, oapve_core_t *core, oapv_bs_t *bs, s16 } } -int oapvd_vlc_dc_coeff(oapvd_ctx_t *ctx, oapvd_core_t *core, oapv_bs_t *bs, s16 *dc_diff, int c) +int oapvd_vlc_dc_coeff(oapvd_ctx_t *ctx, oapvd_core_t *core, oapv_bs_t *bs, int *dc_diff, int c) { int rice_level = 0; int abs_dc_diff; @@ -1255,6 +1262,7 @@ int oapvd_vlc_ac_coeff(oapvd_ctx_t *ctx, oapvd_core_t *core, oapv_bs_t *bs, s16 else { rice_level = oapv_clip3(OAPV_MIN_AC_LEVEL_CTX, OAPV_MAX_AC_LEVEL_CTX, prev_level >> 2); } + if(rice_level == 0) { if(bs->leftbits == 0) { OAPV_READ_FLUSH(bs, 4); @@ -1320,7 +1328,7 @@ int 
oapvd_vlc_metadata(oapv_bs_t *bs, u32 pbu_size, oapvm_t mid, int group_id) u32 metadata_size; metadata_size = oapv_bsr_read(bs, 32); DUMP_HLS(metadata_size, metadata_size); - oapv_assert_gv(metadata_size <= (pbu_size - 8), ret, OAPV_ERR_MALFORMED_BITSTREAM, ERR); + oapv_assert_gv(pbu_size >= 8 && metadata_size <= (pbu_size - 8), ret, OAPV_ERR_MALFORMED_BITSTREAM, ERR); u8 *bs_start_pos = bs->cur; u8 *payload_data = NULL; @@ -1330,6 +1338,7 @@ int oapvd_vlc_metadata(oapv_bs_t *bs, u32 pbu_size, oapvm_t mid, int group_id) do { t0 = oapv_bsr_read(bs, 8); DUMP_HLS(payload_type, t0); + oapv_assert_gv(metadata_size > 0, ret, OAPV_ERR_MALFORMED_BITSTREAM, ERR); metadata_size -= 1; if(t0 == 0xFF) { payload_type += 255; @@ -1341,6 +1350,7 @@ int oapvd_vlc_metadata(oapv_bs_t *bs, u32 pbu_size, oapvm_t mid, int group_id) do { t0 = oapv_bsr_read(bs, 8); DUMP_HLS(payload_size, t0); + oapv_assert_gv(metadata_size > 0, ret, OAPV_ERR_MALFORMED_BITSTREAM, ERR); metadata_size -= 1; if(t0 == 0xFF) { payload_size += 255; diff --git a/src/oapv_vlc.h b/src/oapv_vlc.h index b4788c8..01fc1a3 100644 --- a/src/oapv_vlc.h +++ b/src/oapv_vlc.h @@ -62,6 +62,6 @@ int oapvd_vlc_tile_header(oapv_bs_t* bs, oapvd_ctx_t* ctx, oapv_th_t* th); int oapvd_vlc_tile_dummy_data(oapv_bs_t* bs); int oapvd_vlc_metadata(oapv_bs_t* bs, u32 pbu_size, oapvm_t mid, int group_id); int oapvd_vlc_filler(oapv_bs_t* bs, u32 filler_size); -int oapvd_vlc_dc_coeff(oapvd_ctx_t* ctx, oapvd_core_t* core, oapv_bs_t* bs, s16* dc_diff, int c); +int oapvd_vlc_dc_coeff(oapvd_ctx_t* ctx, oapvd_core_t* core, oapv_bs_t* bs, int* dc_diff, int c); int oapvd_vlc_ac_coeff(oapvd_ctx_t* ctx, oapvd_core_t* core, oapv_bs_t* bs, s16* coef, int c); #endif /* _OAPV_VLC_H_ */ diff --git a/src/sse/oapv_sad_sse.c b/src/sse/oapv_sad_sse.c index 8009bbb..de047af 100644 --- a/src/sse/oapv_sad_sse.c +++ b/src/sse/oapv_sad_sse.c @@ -51,7 +51,7 @@ s00a = _mm_add_epi32(s00a, s00); \ s00a = _mm_add_epi32(s00a, s01); -static s64 ssd_16b_sse_8x8(int w, int 
h, void * src1, void * src2, int s_src1, int s_src2, int bit_depth) +static s64 ssd_16b_sse_8x8(int w, int h, void * src1, void * src2, int s_src1, int s_src2) { s64 ssd; s16 * s1; diff --git a/test/README.md b/test/README.md index 5f8ecca..2af8348 100644 --- a/test/README.md +++ b/test/README.md @@ -5,18 +5,18 @@ | No. | Bitstream Name | Description | Profile   | Level | Band | Frame Rate | Resolution | # of Frame | MD5 sum of bitstream | |-----|----------------|--------------------------------------------------------------|---------------------|-------|------|------------|------------|------------|----------------------------------| -| 1 | tile_A | one-tile per one-picture | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 0b745f686d3154bc23a8b95b486e2c03 | -| 2 | tile_B | Tile size = min size tile (256x128) | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | c9a475186fc36cfb102638896a5d26be | -| 3 | tile_C | # of Tiles: max num tile (20x20) | 422-10 | 5 | 0 | 30 fps | 7680x4320 | 3 | 64da7cb68ec2161de5650a297e1954bb | -| 4 | tile_D | tile dummy data test | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | c9a475186fc36cfb102638896a5d26be | -| 5 | tile_E | tile_size_present_in_fh_flag=on | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 2f0dc83c324876b5bf7f02be9c634cfb | -| 6 | qp_A | QP matrix enabled | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 416800a582b7cbb6a941c4c3866de60f | -| 7 | qp_B | Tile QP variation in a frame | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 514a2aca526820009a16907ee77c3d45 | -| 8 | qp_C | Set all the QPs in a frame equal to min. QP (=0) | 422-10 | 6 | 2 | 60 fps | 3840x2160 | 3 | bc96b1acf6a2332404f712c1278f6d81 | -| 9 | qp_D | Set all the QPs in a frame equal to max. 
QP (=51) | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 90f0e32577e07c30c6b5d75e709e3126 | -| 10 | qp_E | Set different QP betwee luma and chroma | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | d886c4e56086b5f53f4c87dcd62332ab | -| 11 | syn_A | Exercise a synthetic image with QP = 0 and QP = 51 | 422-10 | 4.1 | 2 | 60 fps | 1920x1080 | 2 | a8219946a3e9426935a53d6d55fce987 | -| 12 | syn_B | Exercise a synthetic image with Tile QP variation in Frame | 422-10 | 4.1 | 2 | 60 fps | 1920x1080 | 2 | a8219946a3e9426935a53d6d55fce987 | +| 1 | tile_A | one-tile per one-picture | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | b6e1cef839381b2c90cb9ffcdf537d77 | +| 2 | tile_B | Tile size = min size tile (256x128) | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 9a0cb5126d705b03a2e7bcdcbacf6fbf | +| 3 | tile_C | # of Tiles: max num tile (20x20) | 422-10 | 5 | 0 | 30 fps | 7680x4320 | 3 | 75363d036965a9dccc90a9ce8d0ae652 | +| 4 | tile_D | tile dummy data test | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 0394e3ac275e2bc595c07c5290dc9466 | +| 5 | tile_E | tile_size_present_in_fh_flag=on | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | fdf72572b6551bc6a9eed7f80ca0ec0f | +| 6 | qp_A | QP matrix enabled | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 5ca6d4ea0f65add261b44ed3532a0a73 | +| 7 | qp_B | Tile QP variation in a frame | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | 1b24d4f97c18545b7881002cc642839b | +| 8 | qp_C | Set all the QPs in a frame equal to min. QP (=0) | 422-10 | 6 | 2 | 60 fps | 3840x2160 | 3 | 8c2928ec05eb06d42d6a8bda0ceb7e8d | +| 9 | qp_D | Set all the QPs in a frame equal to max. 
QP (=51) | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | e5acd3d3a0aa7bd6a45a49af35980512 | +| 10 | qp_E | Set different QP between luma and chroma | 422-10 | 4.1 | 2 | 60 fps | 3840x2160 | 3 | e58ea5df35750c0d19cffefde12e78c4 | +| 11 | syn_A | Exercise a synthetic image with QP = 0 and QP = 51 | 422-10 | 4.1 | 2 | 60 fps | 1920x1080 | 2 | e1593a670c62d69718986ff84d1150f3 | +| 12 | syn_B | Exercise a synthetic image with Tile QP variation in Frame | 422-10 | 4.1 | 2 | 60 fps | 1920x1080 | 2 | 9f188e39824829aa05db584034ab1fd0 | ## Test sequence "sequence" folder has the uncompressed video sequence for encoder testing. diff --git a/test/bitstream/qp_A.apv b/test/bitstream/qp_A.apv index 1c26cc6..39b97a6 100644 Binary files a/test/bitstream/qp_A.apv and b/test/bitstream/qp_A.apv differ diff --git a/test/bitstream/qp_B.apv b/test/bitstream/qp_B.apv index 8adb56d..fd3d776 100644 Binary files a/test/bitstream/qp_B.apv and b/test/bitstream/qp_B.apv differ diff --git a/test/bitstream/qp_C.apv b/test/bitstream/qp_C.apv index 3c9908f..33b599e 100644 Binary files a/test/bitstream/qp_C.apv and b/test/bitstream/qp_C.apv differ diff --git a/test/bitstream/qp_D.apv b/test/bitstream/qp_D.apv index 71bf431..6469e60 100644 Binary files a/test/bitstream/qp_D.apv and b/test/bitstream/qp_D.apv differ diff --git a/test/bitstream/qp_E.apv b/test/bitstream/qp_E.apv index 44d904d..24d8091 100644 Binary files a/test/bitstream/qp_E.apv and b/test/bitstream/qp_E.apv differ diff --git a/test/bitstream/syn_A.apv b/test/bitstream/syn_A.apv index 5ed865d..b220d05 100644 Binary files a/test/bitstream/syn_A.apv and b/test/bitstream/syn_A.apv differ diff --git a/test/bitstream/syn_B.apv b/test/bitstream/syn_B.apv index 5ed865d..fb0565c 100644 Binary files a/test/bitstream/syn_B.apv and b/test/bitstream/syn_B.apv differ diff --git a/test/bitstream/tile_A.apv b/test/bitstream/tile_A.apv index 501d45a..074701f 100644 Binary files a/test/bitstream/tile_A.apv and b/test/bitstream/tile_A.apv differ diff
--git a/test/bitstream/tile_B.apv b/test/bitstream/tile_B.apv index 9392009..9930bf7 100644 Binary files a/test/bitstream/tile_B.apv and b/test/bitstream/tile_B.apv differ diff --git a/test/bitstream/tile_C.apv b/test/bitstream/tile_C.apv index 1d4e3a3..4bf2f9b 100644 Binary files a/test/bitstream/tile_C.apv and b/test/bitstream/tile_C.apv differ diff --git a/test/bitstream/tile_D.apv b/test/bitstream/tile_D.apv index 9392009..cf0d33e 100644 Binary files a/test/bitstream/tile_D.apv and b/test/bitstream/tile_D.apv differ diff --git a/test/bitstream/tile_E.apv b/test/bitstream/tile_E.apv index 1ea72f4..029262b 100644 Binary files a/test/bitstream/tile_E.apv and b/test/bitstream/tile_E.apv differ diff --git a/version.txt b/version.txt index 717d1cc..bf42c7d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.1.9.2 +v0.1.10