/**
 *  Copyright 2024 NXP
**/
/*
 * Chroma-only part of a YUV -> RGB conversion for one chroma sample.
 *
 * uv_src_f   : (U, V) pair; the callers in this file subtract 128 before
 *              passing it in.
 * u12_param  : weights applied to (U, V) for the middle output component.
 * y2v1_param : .x multiplies V for the first component,
 *              .y multiplies U for the third component.
 *
 * Returns (V * y2v1_param.x, dot((U, V), u12_param), U * y2v1_param.y).
 * Callers add the scaled luma to this vector; the component order is the
 * byte order they store.
 */
float3 get_YUV_TO_RGB_temp_UV(float2 uv_src_f, float2 u12_param, float2 y2v1_param){
	const float from_v = uv_src_f.y * y2v1_param.x;
	const float from_uv = dot(uv_src_f, u12_param);
	const float from_u = uv_src_f.x * y2v1_param.y;
	return (float3)(from_v, from_uv, from_u);
}
/*
 * Convert a tile of 4 x 2 source pixels into planar luma plus interleaved
 * chroma.
 *
 * Work-item (gx, gy) handles columns 4*gx .. 4*gx+3 of rows 2*gy and 2*gy+1.
 *
 * src           : source pixels; 16 bytes are read from each of the two rows.
 * output_y      : luma plane, 4 bytes written per row.
 * output_uv     : chroma plane (indexed after adding out_uv_offset); 4 bytes
 *                 (two chroma pairs) are written per tile.
 * src_stride    : source row pitch in pixels (the index is scaled by src_bytes).
 * dst_stride    : destination row pitch in bytes, shared by both planes.
 * src_bytes     : bytes per source pixel used for addressing. The 16-byte loads
 *                 are split into four 4-byte pixels, so this is expected to be 4.
 * format        : 0..3 selects the coefficient set; even values also add 16 to
 *                 luma. Coefficients are left unset for any other value.
 *                 NOTE(review): the tables look like BT.601 limited/full and
 *                 BT.709 limited/full - confirm against the host-side enum.
 */
__kernel void RGBA_TO_NV12(__global const uchar * src,
	__global uchar *output_y,
	__global uchar *output_uv, int out_uv_offset,
	int src_stride, int dst_stride, int src_bytes, int format)
{
	const int col = get_global_id(0)*4;
	const int row = get_global_id(1)*2;

	const int in_top = mad24(row, src_stride, col) * src_bytes;
	const int in_bot = mad24(row+1, src_stride, col) * src_bytes;
	const int luma_top = mad24(row, dst_stride, col);
	const int luma_bot = mad24(row+1, dst_stride, col);
	const int chroma_at = mad24(row/2, dst_stride, col);

	/* Four 4-byte pixels from each of the two rows. */
	const uchar16 top = vload16(0, src + in_top);
	const uchar16 bot = vload16(0, src + in_bot);

	const float4 t0 = convert_float4(top.s0123);
	const float4 t1 = convert_float4(top.s4567);
	const float4 t2 = convert_float4(top.s89ab);
	const float4 t3 = convert_float4(top.scdef);

	const float4 b0 = convert_float4(bot.s0123);
	const float4 b1 = convert_float4(bot.s4567);
	const float4 b2 = convert_float4(bot.s89ab);
	const float4 b3 = convert_float4(bot.scdef);

	/* The fourth weight is always 0, so the fourth source byte is ignored. */
	float4 y_param;
	float4 u_param;
	float4 v_param;
	//TODO: use more kernel function
	switch(format){
	case 0:
		y_param = (float4)(0.2568f, 0.5041f, 0.0979f, 0.0f);
		u_param = (float4)(-0.1482f, -0.2910f, 0.4392f, 0.0f);
		v_param = (float4)(0.4392f, -0.3678f, -0.0714f, 0.0f);
		break;
	case 1:
		y_param = (float4)(0.299f, 0.587f, 0.114f, 0.0f);
		u_param = (float4)(-0.1687f, -0.3313f, 0.5f, 0.0f);
		v_param = (float4)(0.5f, -0.4187f, -0.0813f, 0.0f);
		break;
	case 2:
		y_param = (float4)(0.1826f, 0.6142f, 0.0620f, 0.0f);
		u_param = (float4)(-0.1006f, -0.3386f, 0.4392f, 0.0f);
		v_param = (float4)(0.4392f, -0.3989f, -0.0403f, 0.0f);
		break;
	case 3:
		y_param = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
		u_param = (float4)(-0.1146f, -0.3854f, 0.5f, 0.0f);
		v_param = (float4)(0.5f, -0.4542f, -0.0458f, 0.0f);
		break;
	default:
		/* no coefficients are assigned for other format values */
		break;
	}

	/* Luma: one value per pixel, top row in .lo and bottom row in .hi. */
	float8 luma = (float8)(
		dot(t0, y_param), dot(t1, y_param), dot(t2, y_param), dot(t3, y_param),
		dot(b0, y_param), dot(b1, y_param), dot(b2, y_param), dot(b3, y_param));

	/* Chroma: average each 2x2 group, then project onto U and V. */
	const float4 avg_left = (float4)(t0 + t1 + b0 + b1)/4;
	const float4 avg_right = (float4)(t2 + t3 + b2 + b3)/4;
	float4 chroma = (float4)(
		dot(avg_left, u_param), dot(avg_left, v_param),
		dot(avg_right, u_param), dot(avg_right, v_param));

	chroma += 128;

	if((format & 1) == 0){
		luma += 16;
	}

	const uchar8 luma_bytes = convert_uchar8_sat_rte(luma);
	vstore4(luma_bytes.lo, 0, output_y + luma_top);
	vstore4(luma_bytes.hi, 0, output_y + luma_bot);
	vstore4(convert_uchar4_sat_rte(chroma), 0, output_uv + out_uv_offset + chroma_at);
}
/*
 * Convert a 4 x 2 tile from planar luma + interleaved chroma to 3-byte pixels.
 *
 * Work-item (gx, gy) handles columns 4*gx .. 4*gx+3 of rows 2*gy and 2*gy+1.
 * The two rows share one chroma row; each chroma pair serves two adjacent
 * columns.
 *
 * in_y         : luma plane, 4 bytes read per row.
 * in_uv        : chroma plane (indexed after adding in_uv_offset), 4 bytes read.
 * output       : destination; 3 bytes are written per pixel, pixels are spaced
 *                tar_bytes apart, and any further bytes of a pixel are untouched.
 * src_stride   : source row pitch in bytes (used for both planes).
 * dst_stride   : destination row pitch in pixels (the index is scaled by
 *                tar_bytes).
 * format       : 0..3 selects the coefficient set; even values also subtract 16
 *                from luma. Coefficients are left unset for any other value.
 */
__kernel void NV12_TO_RGB(__global const uchar * in_y,
	__global const uchar *in_uv, int in_uv_offset,
	__global uchar *output,
	 int src_stride, int dst_stride, int tar_bytes, int format)
{
	const int col = get_global_id(0)*4;
	const int row = get_global_id(1)*2;
	const int luma_top = mad24(row, src_stride, col);
	const int luma_bot = mad24(row+1, src_stride, col);
	const int chroma_at = mad24(row/2, src_stride, col);
	const int out_top = mad24(row, dst_stride, col) * tar_bytes;
	const int out_bot = mad24(row+1, dst_stride, col) * tar_bytes;

	float4 y_top = convert_float4(vload4(0, in_y + luma_top));
	float4 y_bot = convert_float4(vload4(0, in_y + luma_bot));
	/* (U0, V0, U1, V1), re-centred around zero */
	const float4 chroma = convert_float4(vload4(0, in_uv + in_uv_offset + chroma_at)) - 128;

	if((format & 1) == 0){
		y_top -= 16;
		y_bot -= 16;
	}

	float3 x_param;
	float2 u12_param;
	float2 y2v1_param;

	//TODO: use more kernel function
	switch(format){
	case 0:
		x_param = (float3)(1.164384f);//y0,u0,v0
		y2v1_param = (float2)(1.596027f, 2.017232f);//y2,v1
		u12_param = (float2)(-0.391762f, -0.812968f);//u1,u2
		break;
	case 1:
		x_param = (float3)(1.0f);
		y2v1_param = (float2)(1.402f, 1.772f);
		u12_param = (float2)(-0.344136f, -0.714136f);
		break;
	case 2:
		x_param = (float3)(1.164384f);//y0,u0,v0
		y2v1_param = (float2)(1.792741f, 2.112402f);
		u12_param = (float2)(-0.213249f, -0.532909f);
		break;
	case 3:
		x_param = (float3)(1.0f);
		y2v1_param = (float2)(1.5748f, 1.8556f);
		u12_param = (float2)(-0.187324f, -0.468124f);
		break;
	default:
		/* no coefficients are assigned for other format values */
		break;
	}

	/* Chroma contribution for columns 0-1 and for columns 2-3. */
	const float3 c_left = get_YUV_TO_RGB_temp_UV(chroma.lo, u12_param, y2v1_param);
	const float3 c_right = get_YUV_TO_RGB_temp_UV(chroma.hi, u12_param, y2v1_param);

	const uchar3 top0 = convert_uchar3_sat_rte(x_param * y_top.x + c_left);
	const uchar3 top1 = convert_uchar3_sat_rte(x_param * y_top.y + c_left);
	const uchar3 top2 = convert_uchar3_sat_rte(x_param * y_top.z + c_right);
	const uchar3 top3 = convert_uchar3_sat_rte(x_param * y_top.w + c_right);
	const uchar3 bot0 = convert_uchar3_sat_rte(x_param * y_bot.x + c_left);
	const uchar3 bot1 = convert_uchar3_sat_rte(x_param * y_bot.y + c_left);
	const uchar3 bot2 = convert_uchar3_sat_rte(x_param * y_bot.z + c_right);
	const uchar3 bot3 = convert_uchar3_sat_rte(x_param * y_bot.w + c_right);

	__global uchar *top_out = output + out_top;
	__global uchar *bot_out = output + out_bot;

	vstore3(top0, 0, top_out);
	vstore3(top1, 0, top_out + tar_bytes);
	vstore3(top2, 0, top_out + tar_bytes*2);
	vstore3(top3, 0, top_out + tar_bytes*3);

	vstore3(bot0, 0, bot_out);
	vstore3(bot1, 0, bot_out + tar_bytes);
	vstore3(bot2, 0, bot_out + tar_bytes*2);
	vstore3(bot3, 0, bot_out + tar_bytes*3);
}
/*
 * Convert a 4 x 2 tile from three separate planes (luma, U, V) to 3-byte
 * pixels.
 *
 * Work-item (gx, gy) handles columns 4*gx .. 4*gx+3 of rows 2*gy and 2*gy+1.
 * Two U and two V samples are read; each (U, V) pair serves a 2 x 2 group.
 *
 * in_y         : luma plane, 4 bytes read per row.
 * in_u / in_v  : chroma planes (indexed after adding in_u_offset / in_v_offset);
 *                rows and columns are addressed at half the luma resolution
 *                with a pitch of src_stride / 2.
 * output       : destination; 3 bytes are written per pixel, pixels are spaced
 *                tar_bytes apart.
 * src_stride   : luma row pitch in bytes.
 * dst_stride   : destination row pitch in pixels (the index is scaled by
 *                tar_bytes).
 * format       : 0..3 selects the coefficient set; even values also subtract 16
 *                from luma. Coefficients are left unset for any other value.
 */
__kernel void I420_TO_RGB(__global const uchar * in_y,
	__global const uchar *in_u, int in_u_offset,
	__global const uchar *in_v, int in_v_offset,
	__global uchar *output,
	 int src_stride, int dst_stride, int tar_bytes, int format)
{
	const int col = get_global_id(0) << 2;
	const int row = get_global_id(1) << 1;
	const int luma_top = mad24(row, src_stride, col);
	const int luma_bot = mad24(row+1, src_stride, col);
	const int chroma_at = mad24((row >> 1), (src_stride >> 1), (col >> 1));
	const int out_top = mad24(row, dst_stride, col) * tar_bytes;
	const int out_bot = mad24(row+1, dst_stride, col) * tar_bytes;

	float4 y_top = convert_float4(vload4(0, in_y + luma_top));
	float4 y_bot = convert_float4(vload4(0, in_y + luma_bot));

	/* Interleave the planar samples as (U0, V0, U1, V1) and re-centre them. */
	const uchar2 u_pair = vload2(0, in_u + in_u_offset + chroma_at);
	const uchar2 v_pair = vload2(0, in_v + in_v_offset + chroma_at);
	const float4 chroma = convert_float4((uchar4)(u_pair.x, v_pair.x, u_pair.y, v_pair.y)) - 128;

	if((format & 1) == 0){
		y_top -= 16;
		y_bot -= 16;
	}

	float3 x_param;
	float2 u12_param;
	float2 y2v1_param;

	switch(format){
	case 0:
		x_param = (float3)(1.164384f);//y0,u0,v0
		y2v1_param = (float2)(1.596027f, 2.017232f);//y2,v1
		u12_param = (float2)(-0.391762f, -0.812968f);//u1,u2
		break;
	case 1:
		x_param = (float3)(1.0f);
		y2v1_param = (float2)(1.402f, 1.772f);
		u12_param = (float2)(-0.344136f, -0.714136f);
		break;
	case 2:
		x_param = (float3)(1.164384f);//y0,u0,v0
		y2v1_param = (float2)(1.792741f, 2.112402f);
		u12_param = (float2)(-0.213249f, -0.532909f);
		break;
	case 3:
		x_param = (float3)(1.0f);
		y2v1_param = (float2)(1.5748f, 1.8556f);
		u12_param = (float2)(-0.187324f, -0.468124f);
		break;
	default:
		/* no coefficients are assigned for other format values */
		break;
	}

	const float3 c_left = get_YUV_TO_RGB_temp_UV(chroma.lo, u12_param, y2v1_param);
	const float3 c_right = get_YUV_TO_RGB_temp_UV(chroma.hi, u12_param, y2v1_param);

	const uchar3 top0 = convert_uchar3_sat_rte(x_param * y_top.x + c_left);
	const uchar3 top1 = convert_uchar3_sat_rte(x_param * y_top.y + c_left);
	const uchar3 top2 = convert_uchar3_sat_rte(x_param * y_top.z + c_right);
	const uchar3 top3 = convert_uchar3_sat_rte(x_param * y_top.w + c_right);
	const uchar3 bot0 = convert_uchar3_sat_rte(x_param * y_bot.x + c_left);
	const uchar3 bot1 = convert_uchar3_sat_rte(x_param * y_bot.y + c_left);
	const uchar3 bot2 = convert_uchar3_sat_rte(x_param * y_bot.z + c_right);
	const uchar3 bot3 = convert_uchar3_sat_rte(x_param * y_bot.w + c_right);

	__global uchar *top_out = output + out_top;
	__global uchar *bot_out = output + out_bot;

	vstore3(top0, 0, top_out);
	vstore3(top1, 0, top_out + tar_bytes);
	vstore3(top2, 0, top_out + tar_bytes*2);
	vstore3(top3, 0, top_out + tar_bytes*3);

	vstore3(bot0, 0, bot_out);
	vstore3(bot1, 0, bot_out + tar_bytes);
	vstore3(bot2, 0, bot_out + tar_bytes*2);
	vstore3(bot3, 0, bot_out + tar_bytes*3);
}
/*
 * Convert an 8 x 2 tile of packed 2-byte-per-pixel input to 3-byte pixels.
 *
 * Work-item (gx, gy) handles columns 8*gx .. 8*gx+7 of rows 2*gy and 2*gy+1.
 * In each 16-byte row segment the even bytes are luma and the odd bytes are
 * (U, V) pairs; each pair serves two adjacent pixels of the same row.
 *
 * in_y         : packed source, 16 bytes read per row.
 * output       : destination; 3 bytes are written per pixel and pixels are
 *                spaced tar_bytes apart.
 * src_stride   : source row pitch in pixels (the index is scaled by 2).
 * dst_stride   : destination row pitch in pixels (the index is scaled by
 *                tar_bytes).
 * tar_bytes    : bytes per destination pixel. 3 uses wide packed stores; any
 *                other value falls back to one 3-byte store per pixel so that
 *                pixels land on tar_bytes boundaries, matching NV12_TO_RGB and
 *                I420_TO_RGB.
 * format       : 0..3 selects the coefficient set; even values also subtract 16
 *                from luma. Coefficients are left unset for any other value.
 */
__kernel void YUYV_TO_RGB(__global const uchar * in_y,
	 __global uchar *output,
	 int src_stride, int dst_stride, int tar_bytes, int format)
{
	int x = get_global_id(0)*8;
	int y = get_global_id(1)*2;
	int src_idx = mad24(y, src_stride, x) * 2;
	int src_idx2 = mad24(y+1, src_stride, x) * 2;
	int dst_idx = mad24(y, dst_stride, x) * tar_bytes;
	int dst_idx2 = mad24(y+1, dst_stride, x) * tar_bytes;

	uchar16 src = vload16(0, in_y + src_idx);
	uchar16 src2 = vload16(0, in_y + src_idx2);

	/* odd bytes: U0 V0 U1 V1 ... ; even bytes: Y0 .. Y7 */
	float8 f_uv_src_0 = convert_float8(src.odd) - 128;
	float8 f_uv_src_1 = convert_float8(src2.odd) - 128;

	float8 f_y_src_0 = convert_float8(src.even);
	float8 f_y_src_1 = convert_float8(src2.even);

	if(!(format & 1)){
		f_y_src_0 -= 16;
		f_y_src_1 -= 16;
	}

	float3 x_param;
	float2 u12_param;
	float2 y2v1_param;

	if(format == 0){
		x_param = (float3)(1.164384f);//y0,u0,v0
		y2v1_param = (float2)(1.596027f, 2.017232f);//y2,v1
		u12_param = (float2)(-0.391762f, -0.812968f);//u1,u2
	}else if(format == 1){
		x_param = (float3)(1.0f);
		y2v1_param = (float2)(1.402f, 1.772f);
		u12_param = (float2)(-0.344136f, -0.714136f);
	}else if(format == 2){
		x_param = (float3)(1.164384f);//y0,u0,v0
		y2v1_param = (float2)(1.792741f, 2.112402f);
		u12_param = (float2)(-0.213249f, -0.532909f);
	}else if(format == 3){
		x_param = (float3)(1.0f);
		y2v1_param = (float2)(1.5748f, 1.8556f);
		u12_param = (float2)(-0.187324f, -0.468124f);
	}

	/* One chroma contribution per pixel pair, four pairs per row. */
	float3 uv_temp = get_YUV_TO_RGB_temp_UV(f_uv_src_0.s01, u12_param, y2v1_param);
	float3 uv_temp2 = get_YUV_TO_RGB_temp_UV(f_uv_src_0.s23, u12_param, y2v1_param);
	float3 uv_temp3 = get_YUV_TO_RGB_temp_UV(f_uv_src_0.s45, u12_param, y2v1_param);
	float3 uv_temp4 = get_YUV_TO_RGB_temp_UV(f_uv_src_0.s67, u12_param, y2v1_param);

	float3 uv_temp5 = get_YUV_TO_RGB_temp_UV(f_uv_src_1.s01, u12_param, y2v1_param);
	float3 uv_temp6 = get_YUV_TO_RGB_temp_UV(f_uv_src_1.s23, u12_param, y2v1_param);
	float3 uv_temp7 = get_YUV_TO_RGB_temp_UV(f_uv_src_1.s45, u12_param, y2v1_param);
	float3 uv_temp8 = get_YUV_TO_RGB_temp_UV(f_uv_src_1.s67, u12_param, y2v1_param);

	/* Top row pixels. */
	uchar3 t0 = convert_uchar3_sat_rte(x_param * f_y_src_0.s0 + uv_temp);
	uchar3 t1 = convert_uchar3_sat_rte(x_param * f_y_src_0.s1 + uv_temp);
	uchar3 t2 = convert_uchar3_sat_rte(x_param * f_y_src_0.s2 + uv_temp2);
	uchar3 t3 = convert_uchar3_sat_rte(x_param * f_y_src_0.s3 + uv_temp2);
	uchar3 t4 = convert_uchar3_sat_rte(x_param * f_y_src_0.s4 + uv_temp3);
	uchar3 t5 = convert_uchar3_sat_rte(x_param * f_y_src_0.s5 + uv_temp3);
	uchar3 t6 = convert_uchar3_sat_rte(x_param * f_y_src_0.s6 + uv_temp4);
	uchar3 t7 = convert_uchar3_sat_rte(x_param * f_y_src_0.s7 + uv_temp4);

	/* Bottom row pixels. */
	uchar3 b0 = convert_uchar3_sat_rte(x_param * f_y_src_1.s0 + uv_temp5);
	uchar3 b1 = convert_uchar3_sat_rte(x_param * f_y_src_1.s1 + uv_temp5);
	uchar3 b2 = convert_uchar3_sat_rte(x_param * f_y_src_1.s2 + uv_temp6);
	uchar3 b3 = convert_uchar3_sat_rte(x_param * f_y_src_1.s3 + uv_temp6);
	uchar3 b4 = convert_uchar3_sat_rte(x_param * f_y_src_1.s4 + uv_temp7);
	uchar3 b5 = convert_uchar3_sat_rte(x_param * f_y_src_1.s5 + uv_temp7);
	uchar3 b6 = convert_uchar3_sat_rte(x_param * f_y_src_1.s6 + uv_temp8);
	uchar3 b7 = convert_uchar3_sat_rte(x_param * f_y_src_1.s7 + uv_temp8);

	__global uchar *row0 = output + dst_idx;
	__global uchar *row1 = output + dst_idx2;

	if(tar_bytes == 3){
		/* Tightly packed pixels: 8 * 3 = 24 bytes per row, written as 16 + 8. */
		uchar16 dst0_0;
		uchar8 dst0_1;

		uchar16 dst1_0;
		uchar8 dst1_1;

		dst0_0.s012 = t0;
		dst0_0.s345 = t1;
		dst0_0.s678 = t2;
		dst0_0.s9ab = t3;
		dst0_0.scde = t4;
		dst0_0.sf = t5.x; /* pixel 5 straddles the 16-byte boundary */
		dst0_1.s01 = t5.yz;
		dst0_1.s234 = t6;
		dst0_1.s567 = t7;

		dst1_0.s012 = b0;
		dst1_0.s345 = b1;
		dst1_0.s678 = b2;
		dst1_0.s9ab = b3;
		dst1_0.scde = b4;
		dst1_0.sf = b5.x;
		dst1_1.s01 = b5.yz;
		dst1_1.s234 = b6;
		dst1_1.s567 = b7;

		vstore16(dst0_0, 0, row0);
		vstore8(dst0_1, 0, row0 + 16);

		vstore16(dst1_0, 0, row1);
		vstore8(dst1_1, 0, row1 + 16);
	}else{
		/* Wider pixels: write only the first three bytes of each pixel and
		 * step by tar_bytes so the remaining bytes are left untouched. */
		vstore3(t0, 0, row0);
		vstore3(t1, 0, row0 + tar_bytes);
		vstore3(t2, 0, row0 + tar_bytes*2);
		vstore3(t3, 0, row0 + tar_bytes*3);
		vstore3(t4, 0, row0 + tar_bytes*4);
		vstore3(t5, 0, row0 + tar_bytes*5);
		vstore3(t6, 0, row0 + tar_bytes*6);
		vstore3(t7, 0, row0 + tar_bytes*7);

		vstore3(b0, 0, row1);
		vstore3(b1, 0, row1 + tar_bytes);
		vstore3(b2, 0, row1 + tar_bytes*2);
		vstore3(b3, 0, row1 + tar_bytes*3);
		vstore3(b4, 0, row1 + tar_bytes*4);
		vstore3(b5, 0, row1 + tar_bytes*5);
		vstore3(b6, 0, row1 + tar_bytes*6);
		vstore3(b7, 0, row1 + tar_bytes*7);
	}
}
/*
 * Blend four (Y, U, V) samples bilinearly and convert the result to a 3-byte
 * pixel.
 *
 * data_0..data_3 : samples weighted by (1-a)(1-b), a(1-b), (1-a)b and ab
 *                  respectively.
 * format         : 0..3 selects the coefficient set; even values also subtract
 *                  16 from the blended luma. Coefficients are left unset for
 *                  any other value.
 * a, b           : blend fractions; the callers in this file pass the
 *                  fractional parts of the horizontal and vertical source
 *                  coordinate.
 *
 * Returns the converted pixel, saturated and rounded to nearest even.
 */
uchar3 get_scale_data_from_4_coordinates(uchar3 data_0, uchar3 data_1, uchar3 data_2, uchar3 data_3, int format, float a, float b)
{
	const float w0 = (1 - a) * (1 - b);
	const float w1 = a * (1 - b);
	const float w2 = (1 - a) * b;
	const float w3 = a * b;

	float3 yuv = w0 * convert_float3(data_0) + w1 * convert_float3(data_1) +
			w2 * convert_float3(data_2) + w3 * convert_float3(data_3);

	if((format & 1) == 0){
		yuv.x -= 16;
	}
	/* re-centre chroma around zero */
	yuv.yz -= 128;

	float3 x_param;
	float2 u12_param;
	float2 y2v1_param;

	switch(format){
	case 0:
		x_param = (float3)(1.164384f);//y0,u0,v0
		y2v1_param = (float2)(1.596027f, 2.017232f);//y2,v1
		u12_param = (float2)(-0.391762f, -0.812968f);//u1,u2
		break;
	case 1:
		x_param = (float3)(1.0f);
		y2v1_param = (float2)(1.402f, 1.772f);
		u12_param = (float2)(-0.344136f, -0.714136f);
		break;
	case 2:
		x_param = (float3)(1.164384f);//y0,u0,v0
		y2v1_param = (float2)(1.792741f, 2.112402f);
		u12_param = (float2)(-0.213249f, -0.532909f);
		break;
	case 3:
		x_param = (float3)(1.0f);
		y2v1_param = (float2)(1.5748f, 1.8556f);
		u12_param = (float2)(-0.187324f, -0.468124f);
		break;
	default:
		/* no coefficients are assigned for other format values */
		break;
	}

	const float3 chroma_part = get_YUV_TO_RGB_temp_UV(yuv.yz, u12_param, y2v1_param);
	return convert_uchar3_sat_rte(x_param * yuv.x + chroma_part);
}
/*
 * Bilinear scale + colour conversion from planar luma / interleaved chroma to
 * 3-byte pixels. One work-item produces one destination pixel.
 *
 * src_y         : luma plane.
 * src_uv        : interleaved chroma plane (indexed after adding src_uv_offset).
 * dst_y         : destination, 3 bytes per pixel.
 * src_info      : source rectangle (left/top/right/bottom, width/height) and
 *                 row pitch (stride); sample positions are clamped to
 *                 [left, right-1] x [top, bottom-1].
 * dst_info      : destination size, origin (left/top) and row pitch in pixels.
 * format        : forwarded to get_scale_data_from_4_coordinates().
 *
 * NOTE(review): destination pixel (x, y) is mapped to x * scale - 0.5 rather
 * than the more common (x + 0.5) * scale - 0.5 - confirm this is intended.
 */
__kernel void NV12_TO_RGB_SCALE(__global const uchar * src_y,
	__global const uchar * src_uv, int src_uv_offset,
	__global uchar *dst_y,
	const BufInfo src_info, const BufInfo dst_info, int format)
{
	int x = get_global_id(0);
	int y = get_global_id(1);

	float2 coordinate = {(float)src_info.width / dst_info.width * x + src_info.left, (float)src_info.height / dst_info.height * y + src_info.top};
	coordinate -= 0.5f;

	/* The four neighbouring sample positions, kept inside the source rectangle. */
	int i0 = clamp((int)floor(coordinate.x), src_info.left, src_info.right - 1);
	int j0 = clamp((int)floor(coordinate.y), src_info.top, src_info.bottom - 1);
	int i1 = clamp((int)floor(coordinate.x) + 1, src_info.left, src_info.right - 1);
	int j1 = clamp((int)floor(coordinate.y) + 1, src_info.top, src_info.bottom - 1);

	float a = coordinate.x - floor(coordinate.x);
	float b = coordinate.y - floor(coordinate.y); //x - floor(x)

	/*
	 * A (U, V) pair starts at every even column of a chroma row, so round the
	 * column itself down to even. Rounding the full linear index instead would
	 * pick (V, U) whenever the row base (j/2 * stride) is odd.
	 */
	int i0_uv = i0 & ~1;
	int i1_uv = i1 & ~1;

	int src_idx_0_y = mad24(j0, src_info.stride, i0);
	int src_idx_0_uv = mad24(j0/2, src_info.stride, i0_uv);
	int src_idx_1_y = mad24(j0, src_info.stride, i1);
	int src_idx_1_uv = mad24(j0/2, src_info.stride, i1_uv);
	int src_idx_2_y = mad24(j1, src_info.stride, i0);
	int src_idx_2_uv = mad24(j1/2, src_info.stride, i0_uv);
	int src_idx_3_y = mad24(j1, src_info.stride, i1);
	int src_idx_3_uv = mad24(j1/2, src_info.stride, i1_uv);

	uchar3 data_0;
	uchar3 data_1;
	uchar3 data_2;
	uchar3 data_3;

	data_0.x = src_y[src_idx_0_y];
	data_1.x = src_y[src_idx_1_y];
	data_2.x = src_y[src_idx_2_y];
	data_3.x = src_y[src_idx_3_y];

	data_0.yz = vload2(0, src_uv + src_uv_offset + src_idx_0_uv);
	data_1.yz = vload2(0, src_uv + src_uv_offset + src_idx_1_uv);
	data_2.yz = vload2(0, src_uv + src_uv_offset + src_idx_2_uv);
	data_3.yz = vload2(0, src_uv + src_uv_offset + src_idx_3_uv);

	uchar3 dst_bytes = get_scale_data_from_4_coordinates(data_0, data_1, data_2, data_3, format, a, b);
	int dst_idx = mad24(y + dst_info.top, dst_info.stride, x + dst_info.left) * 3;
	vstore3(dst_bytes, 0, dst_y + dst_idx);
}
/*
 * Bilinear scale + colour conversion from three separate planes (luma, U, V)
 * to 3-byte pixels. One work-item produces one destination pixel.
 *
 * src_y         : luma plane.
 * src_u / src_v : chroma planes (indexed after adding in_u_offset /
 *                 in_v_offset), addressed at half the luma row and column with
 *                 a pitch of stride / 2.
 * dst           : destination, 3 bytes per pixel.
 * src_info      : source rectangle and row pitch; sample positions are clamped
 *                 to [left, right-1] x [top, bottom-1].
 * dst_info      : destination size, origin (left/top) and row pitch in pixels.
 * format        : forwarded to get_scale_data_from_4_coordinates().
 *
 * NOTE(review): destination pixel (x, y) is mapped to x * scale - 0.5 rather
 * than the more common (x + 0.5) * scale - 0.5 - confirm this is intended.
 */
__kernel void I420_TO_RGB_SCALE(__global const uchar * src_y,
	__global const uchar *src_u, int in_u_offset,
	__global const uchar *src_v, int in_v_offset,
	__global uchar *dst,
	const BufInfo src_info, const BufInfo dst_info, int format)
{
	const int x = get_global_id(0);
	const int y = get_global_id(1);

	/* Source position of this destination pixel. */
	float2 pos = {(float)src_info.width / dst_info.width * x + src_info.left, (float)src_info.height / dst_info.height * y + src_info.top};
	pos -= 0.5f;

	const float base_x = floor(pos.x);
	const float base_y = floor(pos.y);

	const int max_col = src_info.right - 1;
	const int max_row = src_info.bottom - 1;

	const int i0 = clamp((int)base_x, src_info.left, max_col);
	const int j0 = clamp((int)base_y, src_info.top, max_row);
	const int i1 = clamp((int)base_x + 1, src_info.left, max_col);
	const int j1 = clamp((int)base_y + 1, src_info.top, max_row);

	const float a = pos.x - base_x;
	const float b = pos.y - base_y;

	const int half_stride = src_info.stride/2;

	/* Neighbours in the order (i0,j0), (i1,j0), (i0,j1), (i1,j1). */
	const int luma_00 = mad24(j0, src_info.stride, i0);
	const int luma_10 = mad24(j0, src_info.stride, i1);
	const int luma_01 = mad24(j1, src_info.stride, i0);
	const int luma_11 = mad24(j1, src_info.stride, i1);

	const int chroma_00 = mad24(j0/2, half_stride, i0/2);
	const int chroma_10 = mad24(j0/2, half_stride, i1/2);
	const int chroma_01 = mad24(j1/2, half_stride, i0/2);
	const int chroma_11 = mad24(j1/2, half_stride, i1/2);

	const uchar3 s00 = (uchar3)(src_y[luma_00], src_u[in_u_offset + chroma_00], src_v[in_v_offset + chroma_00]);
	const uchar3 s10 = (uchar3)(src_y[luma_10], src_u[in_u_offset + chroma_10], src_v[in_v_offset + chroma_10]);
	const uchar3 s01 = (uchar3)(src_y[luma_01], src_u[in_u_offset + chroma_01], src_v[in_v_offset + chroma_01]);
	const uchar3 s11 = (uchar3)(src_y[luma_11], src_u[in_u_offset + chroma_11], src_v[in_v_offset + chroma_11]);

	const uchar3 pixel = get_scale_data_from_4_coordinates(s00, s10, s01, s11, format, a, b);
	const int out_at = mad24(y + dst_info.top, dst_info.stride, x + dst_info.left) * 3;
	vstore3(pixel, 0, dst + out_at);
}
/*
 * Bilinear scale + colour conversion from a packed 2-byte-per-pixel source to
 * 3-byte pixels. One work-item produces one destination pixel.
 *
 * src           : packed source; each sample is fetched through
 *                 read_yuv_from_yuyv() (defined outside this section) using a
 *                 byte index of 2 * (row * stride + column).
 * dst_y         : destination, 3 bytes per pixel.
 * src_info      : source rectangle and row pitch in pixels; sample positions
 *                 are clamped to [left, right-1] x [top, bottom-1].
 * dst_info      : destination size, origin (left/top) and row pitch in pixels.
 * format        : forwarded to get_scale_data_from_4_coordinates().
 *
 * NOTE(review): destination pixel (x, y) is mapped to x * scale - 0.5 rather
 * than the more common (x + 0.5) * scale - 0.5 - confirm this is intended.
 */
__kernel void YUYV_TO_RGB_SCALE(__global const uchar * src,
	__global uchar *dst_y,
	const BufInfo src_info, const BufInfo dst_info, int format)
{
	const int x = get_global_id(0);
	const int y = get_global_id(1);

	/* Source position of this destination pixel. */
	float2 pos = {(float)src_info.width / dst_info.width * x + src_info.left, (float)src_info.height / dst_info.height * y + src_info.top};
	pos -= 0.5f;

	const float base_x = floor(pos.x);
	const float base_y = floor(pos.y);

	const int max_col = src_info.right - 1;
	const int max_row = src_info.bottom - 1;

	const int i0 = clamp((int)base_x, src_info.left, max_col);
	const int j0 = clamp((int)base_y, src_info.top, max_row);
	const int i1 = clamp((int)base_x + 1, src_info.left, max_col);
	const int j1 = clamp((int)base_y + 1, src_info.top, max_row);

	const float a = pos.x - base_x;
	const float b = pos.y - base_y;

	/* Neighbours in the order (i0,j0), (i1,j0), (i0,j1), (i1,j1). */
	const uchar3 s00 = read_yuv_from_yuyv(src, mad24(j0, src_info.stride, i0) << 1);
	const uchar3 s10 = read_yuv_from_yuyv(src, mad24(j0, src_info.stride, i1) << 1);
	const uchar3 s01 = read_yuv_from_yuyv(src, mad24(j1, src_info.stride, i0) << 1);
	const uchar3 s11 = read_yuv_from_yuyv(src, mad24(j1, src_info.stride, i1) << 1);

	const uchar3 pixel = get_scale_data_from_4_coordinates(s00, s10, s01, s11, format, a, b);
	const int out_at = mad24(y + dst_info.top, dst_info.stride, x + dst_info.left) * 3;
	vstore3(pixel, 0, dst_y + out_at);
}