//designed by RuanShengQiang. 
// Program-scope samplers shared by the helpers and the kernel in this file.
// All of them clamp out-of-range coordinates to the image edge.

// Normalized coordinates, nearest-texel filtering.
const sampler_t samplerBG =
	CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Normalized coordinates, linear filtering.
const sampler_t samplerOVL =
	CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

// Unnormalized (pixel) coordinates, linear filtering.
const sampler_t samplerOVLNN =
	CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

// Unnormalized (pixel) coordinates, nearest-texel filtering.
const sampler_t samplerOVLN =
	CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
/*
 * Evaluates a bicubic surface through a 4x4 neighbourhood of pixels.
 *
 * tmpPixels : 16 values laid out as p[i*4 + j], i and j in 0..3. The value at
 *             p[1*4+1] is the sample returned for x == 0 and y == 0.
 * x         : interpolation parameter along the first index (i).
 * y         : interpolation parameter along the second index (j).
 *
 * Returns sum over i,j of a_ij * x^i * y^j, evaluated per float4 component.
 *
 * The one-dimensional weights (see a00..a03) have the form of the Catmull-Rom
 * cubic: p1 + 0.5*t*(p2-p0) + t^2*(p0 - 2.5*p1 + 2*p2 - 0.5*p3)
 *           + t^3*(-0.5*p0 + 1.5*p1 - 1.5*p2 + 0.5*p3).
 */
float4 updateCoefficients_getValue (float4 *tmpPixels,float x,float y){
    float4* p = tmpPixels;
    // x^0 terms: cubic in y through row i == 1.
    float4 a00 = p[1*4+1];
    float4 a01 = -0.5f * p[1*4+0]+ 0.5f * p[1*4+2];
    float4 a02 = p[1*4+0]- 2.5f * p[1*4+1]+ 2.0f * p[1*4+2]- 0.5f * p[1*4+3];
    float4 a03 = -0.5f * p[1*4+0]+ 1.5f * p[1*4+1]- 1.5f * p[1*4+2]+ 0.5f * p[1*4+3];

    // x^1 terms: built from rows i == 0 and i == 2.
    float4 a10 = -0.5f * p[0*4+1]+ 0.5f * p[2*4+1];
    float4 a11 = 0.25f * p[0*4+0]- 0.25f * p[0*4+2]- 0.25f * p[2*4+0]+ 0.25f * p[2*4+2];
    float4 a12 = -0.5f * p[0*4+0]+ 1.25f * p[0*4+1]- p[0*4+2]+ 0.25f * p[0*4+3]+ 0.5f * p[2*4+0]- 1.25f * p[2*4+1]+ p[2*4+2]- 0.25f * p[2*4+3];
    float4 a13 = 0.25f * p[0*4+0]- 0.75f * p[0*4+1] + 0.75f * p[0*4+2]- 0.25f * p[0*4+3]- 0.25f * p[2*4+0]+ 0.75f * p[2*4+1]- 0.75f * p[2*4+2]+ 0.25f * p[2*4+3];

    // x^2 terms: use all four rows.
    float4 a20 = p[0*4+1]- 2.5f * p[1*4+1]+ 2.0f * p[2*4+1]- 0.5f * p[3*4+1];
    float4 a21 = -0.5f * p[0*4+0]+ 0.5f * p[0*4+2]+ 1.25f * p[1*4+0]- 1.25f * p[1*4+2]- p[2*4+0]+ p[2*4+2]+ 0.25f * p[3*4+0]- 0.25f * p[3*4+2];
    float4 a22 = p[0*4+0]- 2.5f * p[0*4+1]+ 2.0f * p[0*4+2]- 0.5f * p[0*4+3]- 2.5f * p[1*4+0] + 6.25f * p[1*4+1]- 5.0f * p[1*4+2]+ 1.25f * p[1*4+3]+ 2.0f * p[2*4+0]- 5.0f * p[2*4+1] + 4.0f * p[2*4+2]- p[2*4+3]- 0.5f * p[3*4+0]+ 1.25f * p[3*4+1]- p[3*4+2]+ 0.25f * p[3*4+3];
    float4 a23 = -0.5f * p[0*4+0]+ 1.5f * p[0*4+1]- 1.5f * p[0*4+2]+ 0.5f * p[0*4+3]+ 1.25f * p[1*4+0]- 3.75f * p[1*4+1]+ 3.75f * p[1*4+2]- 1.25f * p[1*4+3]- p[2*4+0]+ 3.0f * p[2*4+1]- 3.0f * p[2*4+2]+ p[2*4+3]+ 0.25f * p[3*4+0]- 0.75f * p[3*4+1]+ 0.75f * p[3*4+2]- 0.25f * p[3*4+3];

    // x^3 terms: use all four rows.
    float4 a30 = -0.5f * p[0*4+1]+ 1.5f * p[1*4+1]- 1.5f * p[2*4+1]+ 0.5f * p[3*4+1];
    float4 a31 = 0.25f * p[0*4+0]- 0.25f * p[0*4+2]- 0.75f * p[1*4+0]+ 0.75f * p[1*4+2]+ 0.75f * p[2*4+0] - 0.75f * p[2*4+2]- 0.25f * p[3*4+0]+ 0.25f * p[3*4+2];
    float4 a32 = -0.5f * p[0*4+0]+ 1.25f * p[0*4+1]- p[0*4+2]+ 0.25f * p[0*4+3]+ 1.5f * p[1*4+0]- 3.75f * p[1*4+1]+ 3.0f * p[1*4+2]- 0.75f * p[1*4+3]- 1.5f * p[2*4+0]+ 3.75f * p[2*4+1]- 3.0f * p[2*4+2]+ 0.75f * p[2*4+3]+ 0.5f * p[3*4+0]- 1.25f * p[3*4+1]+ p[3*4+2]- 0.25f * p[3*4+3];
    float4 a33 = 0.25f * p[0*4+0]- 0.75f * p[0*4+1]+ 0.75f * p[0*4+2]- 0.25f * p[0*4+3]- 0.75f * p[1*4+0]+ 2.25f * p[1*4+1]- 2.25f * p[1*4+2]+ 0.75f * p[1*4+3]+ 0.75f * p[2*4+0]- 2.25f * p[2*4+1]+ 2.25f * p[2*4+2]- 0.75f * p[2*4+3]- 0.25f * p[3*4+0]+ 0.75f * p[3*4+1]- 0.75f * p[3*4+2]+ 0.25f * p[3*4+3];
		
		
	// Powers of the two interpolation parameters.
	float x2 = x * x;
    float x3 = x2 * x;
    float y2 = y * y;
    float y3 = y2 * y;

    // Each parenthesised group is a cubic in y; the groups are combined as a cubic in x.
    return (a00 + a01 * y + a02 * y2 + a03 * y3) +
        (a10 + a11 * y + a12 * y2 + a13 * y3) * x +
        (a20 + a21 * y + a22 * y2 + a23 * y3) * x2 +
        (a30 + a31 * y + a32 * y2 + a33 * y3) * x3;	
		
}
/*
 * Reads one texel from src_data using unnormalized (pixel) coordinates and
 * nearest filtering (samplerOVLN).
 *
 * row / col are limited to [0, srcHeight - 1] / [0, srcWidth - 1] first; the
 * (src_offsetX, src_offsetY) offset is added after that limiting, so the final
 * coordinate itself is not re-clamped here (the sampler clamps to the edge).
 */
float4 getRGBAValue (image2d_t src_data, float srcWidth, float srcHeight, int row, int col,float src_offsetX,float src_offsetY) {
	const float rowF = (float)row;
	const float colF = (float)col;

	// Same comparisons as a ">= size -> size - 1, < 0 -> 0" if/else chain.
	const float safeRow = (rowF >= srcHeight) ? (srcHeight - 1.0f)
	                    : ((rowF < 0.0f) ? 0.0f : rowF);
	const float safeCol = (colF >= srcWidth) ? (srcWidth - 1.0f)
	                    : ((colF < 0.0f) ? 0.0f : colF);

	const float2 pixelPos = (float2)(safeCol + src_offsetX, safeRow + src_offsetY);
	return read_imagef(src_data, samplerOVLN, pixelPos);
}
/*
 * Reads one texel addressed by integer (row, col) in a grid of size
 * `resolution`, using normalized coordinates and nearest filtering (samplerBG).
 *
 * row / col are limited to [0, resolution.y - 1] / [0, resolution.x - 1], the
 * offset is added, and the sum is divided by `resolution` to obtain the
 * normalized lookup position. The fetched colour is scaled by `matt`.
 *
 * NOTE(review): with a zero offset the lookup lands on a texel boundary
 * (col / resolution.x) rather than a texel centre — confirm that callers
 * rely on this, since nearest filtering at a boundary is rounding-sensitive.
 */
float4 getRGBAValue_normalizeCoord (image2d_t src_data, int row, int col,float src_offsetX,float src_offsetY,float2 resolution,float matt) {
	const float rowF = (float)row;
	const float colF = (float)col;
	const float gridH = resolution.y;
	const float gridW = resolution.x;

	const float safeRow = (rowF >= gridH) ? (gridH - 1.0f)
	                    : ((rowF < 0.0f) ? 0.0f : rowF);
	const float safeCol = (colF >= gridW) ? (gridW - 1.0f)
	                    : ((colF < 0.0f) ? 0.0f : colF);

	// Pixel position plus offset, then scaled into the [0, 1] range per axis.
	const float2 pixelPos = (float2)(safeCol, safeRow) + (float2)(src_offsetX, src_offsetY);
	const float2 uv = pixelPos / resolution;

	return read_imagef(src_data, samplerBG, uv) * matt;
}
/*
 * Variant of the normalized-coordinate texel fetch that also accepts
 * srcWidth / srcHeight. Those two parameters are not used: the limits and the
 * normalization both come from `resolution`.
 *
 * row / col are limited to [0, resolution.y - 1] / [0, resolution.x - 1], the
 * offset is added, the sum is divided by `resolution`, and the texel read with
 * samplerBG (normalized, nearest) is scaled by `matt`.
 */
float4 getRGBAValue_normalizeCoord1(image2d_t src_data, float srcWidth, float srcHeight, int row, int col,float src_offsetX,float src_offsetY,float2 resolution,float matt) {
	const float rowF = (float)row;
	const float colF = (float)col;

	const float safeRow = (rowF >= resolution.y) ? (resolution.y - 1.0f)
	                    : ((rowF < 0.0f) ? 0.0f : rowF);
	const float safeCol = (colF >= resolution.x) ? (resolution.x - 1.0f)
	                    : ((colF < 0.0f) ? 0.0f : colF);

	const float2 pixelPos = (float2)(safeCol, safeRow) + (float2)(src_offsetX, src_offsetY);
	const float2 uv = pixelPos / resolution;

	return read_imagef(src_data, samplerBG, uv) * matt;
}
/*
 * Bicubic resample for a scaled image.
 *
 * The destination pixel (dstColIndex, dstRowIndex) is mapped to a source
 * position with (dst + 0.5) / scale - 0.5, capped at width - 1 / height - 1.
 * The 4x4 block of source texels around that position (offsets -1..2 on each
 * axis) is fetched with getRGBAValue and interpolated with
 * updateCoefficients_getValue.
 */
float4 bicubic(image2d_t src_data,float dstColIndex, float dstRowIndex,float width,float height,float scaleW,float scaleH,float src_offsetX,float src_offsetY){
	const float srcCol = min(width - 1, (dstColIndex + 0.5f) / scaleW - 0.5f);
	const float srcRow = min(height - 1, (dstRowIndex + 0.5f) / scaleH - 0.5f);

	// Integer texel below the sample position and the fractional remainder.
	const int baseCol = (int)floor(srcCol);
	const int baseRow = (int)floor(srcRow);
	const float fracCol = srcCol - baseCol;
	const float fracRow = srcRow - baseRow;

	// Every entry is written by the loop below before it is read.
	float4 taps[16];
	for (int r = 0; r < 4; ++r) {
		for (int c = 0; c < 4; ++c) {
			taps[r * 4 + c] = getRGBAValue(src_data, width, height,
			                               baseRow + r - 1, baseCol + c - 1,
			                               src_offsetX, src_offsetY);
		}
	}

	// First parameter follows the row index of taps, second the column index.
	return updateCoefficients_getValue(taps, fracRow, fracCol);
}

/*
 * Bicubic sample at a source position given in pixel units of `resolution`.
 *
 * (srcColIndex, srcRowIndex) is capped at resolution - 1 per axis, split into
 * an integer base texel and a fractional part, and the surrounding 4x4 block
 * (offsets -1..2) is fetched with getRGBAValue_normalizeCoord. Each fetched
 * value is already scaled by `matt` inside that helper.
 *
 * NOTE(review): the fractional part is taken directly from the incoming
 * position, with no half-pixel adjustment — confirm this matches the
 * coordinate convention of the caller.
 */
float4 rotate_bicubic(image2d_t src_data,float srcColIndex,float srcRowIndex,float src_offsetX,float src_offsetY,float2 resolution,float matt){
	const float width = resolution.x;
	const float height = resolution.y;

	const float srcCol = min(srcColIndex, width - 1);
	const float srcRow = min(srcRowIndex, height - 1);

	const int baseCol = (int)floor(srcCol);
	const int baseRow = (int)floor(srcRow);
	const float fracCol = srcCol - baseCol;
	const float fracRow = srcRow - baseRow;

	// Every entry is written by the loop below before it is read.
	float4 taps[16];
	for (int r = 0; r < 4; ++r) {
		for (int c = 0; c < 4; ++c) {
			taps[r * 4 + c] = getRGBAValue_normalizeCoord(src_data,
			                                              baseRow + r - 1, baseCol + c - 1,
			                                              src_offsetX, src_offsetY,
			                                              resolution, matt);
		}
	}

	// First parameter follows the row index of taps, second the column index.
	return updateCoefficients_getValue(taps, fracRow, fracCol);
}

/*
 * Bicubic sample at (srcColIndex, srcRowIndex), capped at width - 1 /
 * height - 1, using getRGBAValue_normalizeCoord1 for the 4x4 texel fetch.
 *
 * dstColIndex and dstRowIndex are accepted but not used. width / height only
 * limit the sample position here; the per-texel limits and normalization in
 * the fetch helper come from `resolution`.
 */
float4 rotate_bicubic1(image2d_t src_data, float dstColIndex,float dstRowIndex,float width,float height,float srcColIndex,float srcRowIndex,float src_offsetX,float src_offsetY,float2 resolution,float matt){
	const float srcCol = min(srcColIndex, width - 1);
	const float srcRow = min(srcRowIndex, height - 1);

	const int baseCol = (int)floor(srcCol);
	const int baseRow = (int)floor(srcRow);
	const float fracCol = srcCol - baseCol;
	const float fracRow = srcRow - baseRow;

	// Every entry is written by the loop below before it is read.
	float4 taps[16];
	for (int r = 0; r < 4; ++r) {
		for (int c = 0; c < 4; ++c) {
			taps[r * 4 + c] = getRGBAValue_normalizeCoord1(src_data, width, height,
			                                               baseRow + r - 1, baseCol + c - 1,
			                                               src_offsetX, src_offsetY,
			                                               resolution, matt);
		}
	}

	// First parameter follows the row index of taps, second the column index.
	return updateCoefficients_getValue(taps, fracRow, fracCol);
}

/*
 * Rotates `uv` about `center` by `theta` radians using the matrix
 * [cos -sin; sin cos] and returns the rotated point.
 */
float2 rotateFunc(float2 uv, float2 center, float theta)
{
	const float c = cos(theta);
	const float s = sin(theta);
	const float2 delta = uv - center;

	const float2 rotated = (float2)(dot((float2)(c, -s), delta),
	                                dot((float2)(s, c), delta));
	return (rotated + center);
}
/*
 * Floored modulo: x - y * floor(x / y).
 * For y > 0 the result is non-negative even when x is negative.
 */
float my_fmod(float x,float y){
	const float ratio = x / y;
	return x - y * floor(ratio);
}

/*
 * Renders `overlay` into `dest_data` with translation, rotation and scaling.
 *
 * One work-item produces one destination pixel. The destination pixel centre
 * is converted to a coordinate normalized by (param->width[1],
 * param->height[1]) and mapped back into overlay space by:
 *   1. subtracting the offset between the centres of resultROI and origROI,
 *   2. rotating about the origROI centre (done in pixel units so that the
 *      rotation is not distorted by the aspect ratio),
 *   3. dividing the distance from that centre by the resultROI / origROI
 *      size ratio.
 * Pixels whose mapped coordinate falls outside origROI are written as zero.
 *
 * overlay    : source image.
 * dest_data  : destination image.
 * param      : origROI / resultROI are read as {x, y, width, height}; they are
 *              compared against normalized coordinates. width[0]/height[0]
 *              are used as the overlay size, width[1]/height[1] as the
 *              destination size.
 * theta      : rotation angle; scaled by -pi/180 before use, so presumably
 *              given in degrees — confirm against the host code.
 * samperType : 0 = nearest, 1 = linear, anything else = bicubic.
 *
 * NOTE(review): a zero-sized origROI or resultROI makes the scale factors
 * zero or non-finite; the host is assumed to pass non-degenerate ROIs.
 */
__kernel void MAIN(__read_only image2d_t overlay, __write_only image2d_t dest_data,  __global FilterParam* param,  float theta,int samperType)
{
	const int2 coordinate = (int2)(get_global_id(0), get_global_id(1));

	// Writing outside the image is undefined; skip surplus work-items that
	// exist when the global size is rounded up past the image size.
	if (coordinate.x >= get_image_width(dest_data) || coordinate.y >= get_image_height(dest_data)) {
		return;
	}

	const float2 resolution = (float2)((float)(param->width[1]),(float)(param->height[1]));
	const float2 overlayRes = (float2)((float)(param->width[0]),(float)(param->height[0]));

	// Centre of the destination pixel, normalized by the destination size.
	const float2 fragCoord = (float2)(coordinate.x, coordinate.y)+(float2)(0.5f,0.5f);
	float2 tc = fragCoord/resolution.xy;

	// ROI corners: [0],[1] are the origin, [2],[3] the extent.
	const float roiX0 = param->origROI[0];
	const float roiY0 = param->origROI[1];
	const float roiX1 = param->origROI[2] + param->origROI[0];
	const float roiY1 = param->origROI[3] + param->origROI[1];

	const float resultX0 = param->resultROI[0];
	const float resultY0 = param->resultROI[1];
	const float resultX1 = param->resultROI[2] + param->resultROI[0];
	const float resultY1 = param->resultROI[3] + param->resultROI[1];

	const float2 roiCenter = (float2)((roiX1-roiX0)*0.5f + roiX0, (roiY1-roiY0)*0.5f + roiY0);
	const float2 resultRoiCenter = (float2)((resultX1 - resultX0)*0.5f + resultX0, (resultY1 - resultY0)*0.5f + resultY0);
	const float2 transl = resultRoiCenter - roiCenter;

	const float scalFactorX = (resultX1 - resultX0)/(roiX1 - roiX0);
	const float scalFactorY = (resultY1 - resultY0)/(roiY1 - roiY0);

	// -pi/180: converts the angle and inverts it, since the destination is
	// being mapped back to the source.
	const float thetaRad = -0.0174532925199433f*theta;

	// Inverse transform: translate, rotate (in pixel units), then scale.
	const float2 center = roiCenter;
	tc = tc - transl;
	tc = rotateFunc(tc*resolution.xy,resolution.xy*center,thetaRad)/resolution.xy;
	tc.x = ( tc.x - center.x )/(scalFactorX) + center.x;
	tc.y = ( tc.y - center.y )/(scalFactorY) + center.y;

	// 1 inside origROI, 0 outside.
	const float matt = step(roiX0,tc.x)*step(tc.x, roiX1 )*step(roiY0,tc.y)*step(tc.y, roiY1);

	float4 ovlCol=(float4)(0.0f);

	if(samperType==1){//linear
		const float srcWidth = overlayRes.x;
		const float srcHeight = overlayRes.y;
		const float one_PixelX = 1.0f/srcWidth;
		const float one_PixelY = 1.0f/srcHeight;
		float featherMatt = 1.0f;

		// When rotated, fade the outermost overlay texel on each side of the
		// ROI instead of cutting it off with a hard edge.
		if(fabs(theta) > 1.0e-5f){
			const float featherX = smoothstep(roiX0, roiX0 + one_PixelX, tc.x) * (1.0f - smoothstep(roiX1 - one_PixelX, roiX1, tc.x));
			const float featherY = smoothstep(roiY0, roiY0 + one_PixelY, tc.y) * (1.0f - smoothstep(roiY1 - one_PixelY, roiY1, tc.y));
			featherMatt = featherX*featherY;
		}

		ovlCol = read_imagef(overlay, samplerOVL, tc) *matt*featherMatt;
	}else if(samperType==0){ //nearest
		ovlCol = read_imagef(overlay, samplerBG, tc) * matt;
	}else { //bicubic
		// rotate_bicubic expects the position in pixel units of `resolution`.
		tc*=resolution.xy;
		ovlCol=rotate_bicubic(overlay,tc.x,tc.y,0.0f,0.0f,resolution,matt)*matt;
	}

	write_imagef(dest_data, coordinate, ovlCol);
}