| /* |
| Copyright 1999-2017 ImageMagick Studio LLC, a non-profit organization |
| dedicated to making software imaging solutions freely available. |
| |
| You may not use this file except in compliance with the License. |
| obtain a copy of the License at |
| |
| https://www.imagemagick.org/script/license.php |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| |
| MagickCore private kernels for accelerated functions. |
| */ |
| |
| #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H |
| #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H |
| |
| #if defined(__cplusplus) || defined(c_plusplus) |
| extern "C" { |
| #endif |
| |
| #if defined(MAGICKCORE_OPENCL_SUPPORT) |
| |
| /* |
| Define declarations. |
| */ |
| #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n" |
| #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n" |
| #define OPENCL_ELSE() "\n #""else " " \n" |
| #define OPENCL_ENDIF() "\n #""endif " " \n" |
| #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n" |
| #define STRINGIFY(...) #__VA_ARGS__ "\n" |
| |
| const char *accelerateKernels = |
| |
| /* |
| Define declarations. |
| */ |
| OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) |
| OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f)) |
| OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) |
| OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) |
| OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) |
| OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) |
| OPENCL_DEFINE(SigmaRandom, (attenuate)) |
| OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) |
| OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y))) |
| OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y))) |
| |
| /* |
| Typedef declarations. |
| */ |
| STRINGIFY( |
| typedef enum |
| { |
| UndefinedColorspace, |
| RGBColorspace, /* Linear RGB colorspace */ |
| GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */ |
| TransparentColorspace, |
| OHTAColorspace, |
| LabColorspace, |
| XYZColorspace, |
| YCbCrColorspace, |
| YCCColorspace, |
| YIQColorspace, |
| YPbPrColorspace, |
| YUVColorspace, |
| CMYKColorspace, /* negared linear RGB with black separated */ |
| sRGBColorspace, /* Default: non-lienar sRGB colorspace */ |
| HSBColorspace, |
| HSLColorspace, |
| HWBColorspace, |
| Rec601LumaColorspace, |
| Rec601YCbCrColorspace, |
| Rec709LumaColorspace, |
| Rec709YCbCrColorspace, |
| LogColorspace, |
| CMYColorspace, /* negated linear RGB colorspace */ |
| LuvColorspace, |
| HCLColorspace, |
| LCHColorspace, /* alias for LCHuv */ |
| LMSColorspace, |
| LCHabColorspace, /* Cylindrical (Polar) Lab */ |
| LCHuvColorspace, /* Cylindrical (Polar) Luv */ |
| scRGBColorspace, |
| HSIColorspace, |
| HSVColorspace, /* alias for HSB */ |
| HCLpColorspace, |
| YDbDrColorspace |
| } ColorspaceType; |
| ) |
| |
| STRINGIFY( |
| typedef enum |
| { |
| UndefinedCompositeOp, |
| NoCompositeOp, |
| ModulusAddCompositeOp, |
| AtopCompositeOp, |
| BlendCompositeOp, |
| BumpmapCompositeOp, |
| ChangeMaskCompositeOp, |
| ClearCompositeOp, |
| ColorBurnCompositeOp, |
| ColorDodgeCompositeOp, |
| ColorizeCompositeOp, |
| CopyBlackCompositeOp, |
| CopyBlueCompositeOp, |
| CopyCompositeOp, |
| CopyCyanCompositeOp, |
| CopyGreenCompositeOp, |
| CopyMagentaCompositeOp, |
| CopyOpacityCompositeOp, |
| CopyRedCompositeOp, |
| CopyYellowCompositeOp, |
| DarkenCompositeOp, |
| DstAtopCompositeOp, |
| DstCompositeOp, |
| DstInCompositeOp, |
| DstOutCompositeOp, |
| DstOverCompositeOp, |
| DifferenceCompositeOp, |
| DisplaceCompositeOp, |
| DissolveCompositeOp, |
| ExclusionCompositeOp, |
| HardLightCompositeOp, |
| HueCompositeOp, |
| InCompositeOp, |
| LightenCompositeOp, |
| LinearLightCompositeOp, |
| LuminizeCompositeOp, |
| MinusDstCompositeOp, |
| ModulateCompositeOp, |
| MultiplyCompositeOp, |
| OutCompositeOp, |
| OverCompositeOp, |
| OverlayCompositeOp, |
| PlusCompositeOp, |
| ReplaceCompositeOp, |
| SaturateCompositeOp, |
| ScreenCompositeOp, |
| SoftLightCompositeOp, |
| SrcAtopCompositeOp, |
| SrcCompositeOp, |
| SrcInCompositeOp, |
| SrcOutCompositeOp, |
| SrcOverCompositeOp, |
| ModulusSubtractCompositeOp, |
| ThresholdCompositeOp, |
| XorCompositeOp, |
| /* These are new operators, added after the above was last sorted. |
| * The list should be re-sorted only when a new library version is |
| * created. |
| */ |
| DivideDstCompositeOp, |
| DistortCompositeOp, |
| BlurCompositeOp, |
| PegtopLightCompositeOp, |
| VividLightCompositeOp, |
| PinLightCompositeOp, |
| LinearDodgeCompositeOp, |
| LinearBurnCompositeOp, |
| MathematicsCompositeOp, |
| DivideSrcCompositeOp, |
| MinusSrcCompositeOp, |
| DarkenIntensityCompositeOp, |
| LightenIntensityCompositeOp |
| } CompositeOperator; |
| ) |
| |
| STRINGIFY( |
| typedef enum |
| { |
| UndefinedFunction, |
| ArcsinFunction, |
| ArctanFunction, |
| PolynomialFunction, |
| SinusoidFunction |
| } MagickFunction; |
| ) |
| |
| STRINGIFY( |
| typedef enum |
| { |
| UndefinedNoise, |
| UniformNoise, |
| GaussianNoise, |
| MultiplicativeGaussianNoise, |
| ImpulseNoise, |
| LaplacianNoise, |
| PoissonNoise, |
| RandomNoise |
| } NoiseType; |
| ) |
| |
| STRINGIFY( |
| typedef enum |
| { |
| UndefinedPixelIntensityMethod = 0, |
| AveragePixelIntensityMethod, |
| BrightnessPixelIntensityMethod, |
| LightnessPixelIntensityMethod, |
| Rec601LumaPixelIntensityMethod, |
| Rec601LuminancePixelIntensityMethod, |
| Rec709LumaPixelIntensityMethod, |
| Rec709LuminancePixelIntensityMethod, |
| RMSPixelIntensityMethod, |
| MSPixelIntensityMethod |
| } PixelIntensityMethod; |
| ) |
| |
| STRINGIFY( |
| typedef enum { |
| BoxWeightingFunction = 0, |
| TriangleWeightingFunction, |
| CubicBCWeightingFunction, |
| HannWeightingFunction, |
| HammingWeightingFunction, |
| BlackmanWeightingFunction, |
| GaussianWeightingFunction, |
| QuadraticWeightingFunction, |
| JincWeightingFunction, |
| SincWeightingFunction, |
| SincFastWeightingFunction, |
| KaiserWeightingFunction, |
| WelshWeightingFunction, |
| BohmanWeightingFunction, |
| LagrangeWeightingFunction, |
| CosineWeightingFunction, |
| } ResizeWeightingFunctionType; |
| ) |
| |
| STRINGIFY( |
| typedef enum |
| { |
| UndefinedChannel = 0x0000, |
| RedChannel = 0x0001, |
| GrayChannel = 0x0001, |
| CyanChannel = 0x0001, |
| GreenChannel = 0x0002, |
| MagentaChannel = 0x0002, |
| BlueChannel = 0x0004, |
| YellowChannel = 0x0004, |
| BlackChannel = 0x0008, |
| AlphaChannel = 0x0010, |
| OpacityChannel = 0x0010, |
| IndexChannel = 0x0020, /* Color Index Table? */ |
| ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */ |
| WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */ |
| MetaChannel = 0x0100, /* ???? */ |
| CompositeChannels = 0x001F, |
| AllChannels = 0x7ffffff, |
| /* |
| Special purpose channel types. |
| FUTURE: are these needed any more - they are more like hacks |
| SyncChannels for example is NOT a real channel but a 'flag' |
| It really says -- "User has not defined channels" |
| Though it does have extra meaning in the "-auto-level" operator |
| */ |
| TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */ |
| RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */ |
| GrayChannels = 0x0400, |
| SyncChannels = 0x20000, /* channels modified as a single unit */ |
| DefaultChannels = AllChannels |
| } ChannelType; /* must correspond to PixelChannel */ |
| ) |
| |
| /* |
| Helper functions. |
| */ |
| |
| OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8)) |
| |
| STRINGIFY( |
| inline CLQuantum ScaleCharToQuantum(const unsigned char value) |
| { |
| return((CLQuantum) value); |
| } |
| ) |
| |
| OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16)) |
| |
| STRINGIFY( |
| inline CLQuantum ScaleCharToQuantum(const unsigned char value) |
| { |
| return((CLQuantum) (257.0f*value)); |
| } |
| ) |
| |
| OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32)) |
| |
| STRINGIFY( |
| inline CLQuantum ScaleCharToQuantum(const unsigned char value) |
| { |
| return((CLQuantum) (16843009.0*value)); |
| } |
| ) |
| |
| OPENCL_ENDIF() |
| |
| STRINGIFY( |
| inline int ClampToCanvas(const int offset,const int range) |
| { |
| return clamp(offset, (int)0, range-1); |
| } |
| ) |
| |
| STRINGIFY( |
| inline CLQuantum ClampToQuantum(const float value) |
| { |
| return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f); |
| } |
| ) |
| |
| STRINGIFY( |
| inline uint ScaleQuantumToMap(CLQuantum value) |
| { |
| if (value >= (CLQuantum) MaxMap) |
| return ((uint)MaxMap); |
| else |
| return ((uint)value); |
| } |
| ) |
| |
| STRINGIFY( |
| inline float PerceptibleReciprocal(const float x) |
| { |
| float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0; |
| return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon)); |
| } |
| ) |
| |
| STRINGIFY( |
| inline float RoundToUnity(const float value) |
| { |
| return clamp(value,0.0f,1.0f); |
| } |
| ) |
| |
| STRINGIFY( |
| |
| inline unsigned int getPixelIndex(const unsigned int number_channels, |
| const unsigned int columns, const unsigned int x, const unsigned int y) |
| { |
| return (x * number_channels) + (y * columns * number_channels); |
| } |
| |
| inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; } |
| inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); } |
| inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); } |
| inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); } |
| |
| inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; } |
| inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; } |
| inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; } |
| inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; } |
| |
| inline CLQuantum getBlue(CLPixelType p) { return p.x; } |
| inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; } |
| inline float getBlueF4(float4 p) { return p.x; } |
| inline void setBlueF4(float4* p, float value) { (*p).x = value; } |
| |
| inline CLQuantum getGreen(CLPixelType p) { return p.y; } |
| inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; } |
| inline float getGreenF4(float4 p) { return p.y; } |
| inline void setGreenF4(float4* p, float value) { (*p).y = value; } |
| |
| inline CLQuantum getRed(CLPixelType p) { return p.z; } |
| inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; } |
| inline float getRedF4(float4 p) { return p.z; } |
| inline void setRedF4(float4* p, float value) { (*p).z = value; } |
| |
| inline CLQuantum getAlpha(CLPixelType p) { return p.w; } |
| inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; } |
| inline float getAlphaF4(float4 p) { return p.w; } |
| inline void setAlphaF4(float4* p, float value) { (*p).w = value; } |
| |
| inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels, |
| const ChannelType channel, float *red, float *green, float *blue, float *alpha) |
| { |
| if ((channel & RedChannel) != 0) |
| *red=getPixelRed(p); |
| |
| if (number_channels > 2) |
| { |
| if ((channel & GreenChannel) != 0) |
| *green=getPixelGreen(p); |
| |
| if ((channel & BlueChannel) != 0) |
| *blue=getPixelBlue(p); |
| } |
| |
| if (((number_channels == 4) || (number_channels == 2)) && |
| ((channel & AlphaChannel) != 0)) |
| *alpha=getPixelAlpha(p,number_channels); |
| } |
| |
| inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels, |
| const unsigned int columns, const unsigned int x, const unsigned int y) |
| { |
| const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); |
| |
| float4 pixel; |
| |
| pixel.x=getPixelRed(p); |
| |
| if (number_channels > 2) |
| { |
| pixel.y=getPixelGreen(p); |
| pixel.z=getPixelBlue(p); |
| } |
| |
| if ((number_channels == 4) || (number_channels == 2)) |
| pixel.w=getPixelAlpha(p,number_channels); |
| return(pixel); |
| } |
| |
| inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels, |
| const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel) |
| { |
| const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); |
| |
| float red = 0.0f; |
| float green = 0.0f; |
| float blue = 0.0f; |
| float alpha = 0.0f; |
| |
| ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); |
| return (float4)(red, green, blue, alpha); |
| } |
| |
| inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels, |
| const ChannelType channel, float red, float green, float blue, float alpha) |
| { |
| if ((channel & RedChannel) != 0) |
| setPixelRed(p,ClampToQuantum(red)); |
| |
| if (number_channels > 2) |
| { |
| if ((channel & GreenChannel) != 0) |
| setPixelGreen(p,ClampToQuantum(green)); |
| |
| if ((channel & BlueChannel) != 0) |
| setPixelBlue(p,ClampToQuantum(blue)); |
| } |
| |
| if (((number_channels == 4) || (number_channels == 2)) && |
| ((channel & AlphaChannel) != 0)) |
| setPixelAlpha(p,number_channels,ClampToQuantum(alpha)); |
| } |
| |
| inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels, |
| const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel) |
| { |
| __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); |
| |
| setPixelRed(p,ClampToQuantum(pixel.x)); |
| |
| if (number_channels > 2) |
| { |
| setPixelGreen(p,ClampToQuantum(pixel.y)); |
| setPixelBlue(p,ClampToQuantum(pixel.z)); |
| } |
| |
| if ((number_channels == 4) || (number_channels == 2)) |
| setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w)); |
| } |
| |
| inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels, |
| const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel, |
| float4 pixel) |
| { |
| __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); |
| WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w); |
| } |
| |
| inline float GetPixelIntensity(const unsigned int colorspace, |
| const unsigned int method,float red,float green,float blue) |
| { |
| float intensity; |
| |
| if (colorspace == GRAYColorspace) |
| return red; |
| |
| switch (method) |
| { |
| case AveragePixelIntensityMethod: |
| { |
| intensity=(red+green+blue)/3.0; |
| break; |
| } |
| case BrightnessPixelIntensityMethod: |
| { |
| intensity=MagickMax(MagickMax(red,green),blue); |
| break; |
| } |
| case LightnessPixelIntensityMethod: |
| { |
| intensity=(MagickMin(MagickMin(red,green),blue)+ |
| MagickMax(MagickMax(red,green),blue))/2.0; |
| break; |
| } |
| case MSPixelIntensityMethod: |
| { |
| intensity=(float) (((float) red*red+green*green+blue*blue)/ |
| (3.0*QuantumRange)); |
| break; |
| } |
| case Rec601LumaPixelIntensityMethod: |
| { |
| /* |
| if (image->colorspace == RGBColorspace) |
| { |
| red=EncodePixelGamma(red); |
| green=EncodePixelGamma(green); |
| blue=EncodePixelGamma(blue); |
| } |
| */ |
| intensity=0.298839*red+0.586811*green+0.114350*blue; |
| break; |
| } |
| case Rec601LuminancePixelIntensityMethod: |
| { |
| /* |
| if (image->colorspace == sRGBColorspace) |
| { |
| red=DecodePixelGamma(red); |
| green=DecodePixelGamma(green); |
| blue=DecodePixelGamma(blue); |
| } |
| */ |
| intensity=0.298839*red+0.586811*green+0.114350*blue; |
| break; |
| } |
| case Rec709LumaPixelIntensityMethod: |
| default: |
| { |
| /* |
| if (image->colorspace == RGBColorspace) |
| { |
| red=EncodePixelGamma(red); |
| green=EncodePixelGamma(green); |
| blue=EncodePixelGamma(blue); |
| } |
| */ |
| intensity=0.212656*red+0.715158*green+0.072186*blue; |
| break; |
| } |
| case Rec709LuminancePixelIntensityMethod: |
| { |
| /* |
| if (image->colorspace == sRGBColorspace) |
| { |
| red=DecodePixelGamma(red); |
| green=DecodePixelGamma(green); |
| blue=DecodePixelGamma(blue); |
| } |
| */ |
| intensity=0.212656*red+0.715158*green+0.072186*blue; |
| break; |
| } |
| case RMSPixelIntensityMethod: |
| { |
| intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/ |
| sqrt(3.0)); |
| break; |
| } |
| } |
| |
| return intensity; |
| } |
| |
| inline int mirrorBottom(int value) |
| { |
| return (value < 0) ? - (value) : value; |
| } |
| |
| inline int mirrorTop(int value, int width) |
| { |
| return (value >= width) ? (2 * width - value - 1) : value; |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % A d d N o i s e % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| /* |
| Part of MWC64X by David Thomas, dt10@imperial.ac.uk |
| This is provided under BSD, full license is with the main package. |
| See http://www.doc.ic.ac.uk/~dt10/research |
| */ |
| |
| // Pre: a<M, b<M |
| // Post: r=(a+b) mod M |
| ulong MWC_AddMod64(ulong a, ulong b, ulong M) |
| { |
| ulong v=a+b; |
| //if( (v>=M) || (v<a) ) |
| if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. |
| v=v-M; |
| return v; |
| } |
| |
| // Pre: a<M,b<M |
| // Post: r=(a*b) mod M |
| // This could be done more efficently, but it is portable, and should |
| // be easy to understand. It can be replaced with any of the better |
| // modular multiplication algorithms (for example if you know you have |
| // double precision available or something). |
| ulong MWC_MulMod64(ulong a, ulong b, ulong M) |
| { |
| ulong r=0; |
| while(a!=0){ |
| if(a&1) |
| r=MWC_AddMod64(r,b,M); |
| b=MWC_AddMod64(b,b,M); |
| a=a>>1; |
| } |
| return r; |
| } |
| |
| // Pre: a<M, e>=0 |
| // Post: r=(a^b) mod M |
| // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on |
| // most architectures |
| ulong MWC_PowMod64(ulong a, ulong e, ulong M) |
| { |
| ulong sqr=a, acc=1; |
| while(e!=0){ |
| if(e&1) |
| acc=MWC_MulMod64(acc,sqr,M); |
| sqr=MWC_MulMod64(sqr,sqr,M); |
| e=e>>1; |
| } |
| return acc; |
| } |
| |
| uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) |
| { |
| ulong m=MWC_PowMod64(A, distance, M); |
| ulong x=curr.x*(ulong)A+curr.y; |
| x=MWC_MulMod64(x, m, M); |
| return (uint2)((uint)(x/A), (uint)(x%A)); |
| } |
| |
| uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) |
| { |
| // This is an arbitrary constant for starting LCG jumping from. I didn't |
| // want to start from 1, as then you end up with the two or three first values |
| // being a bit poor in ones - once you've decided that, one constant is as |
| // good as any another. There is no deep mathematical reason for it, I just |
| // generated a random number. |
| enum{ MWC_BASEID = 4077358422479273989UL }; |
| |
| ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; |
| ulong m=MWC_PowMod64(A, dist, M); |
| |
| ulong x=MWC_MulMod64(MWC_BASEID, m, M); |
| return (uint2)((uint)(x/A), (uint)(x%A)); |
| } |
| |
| //! Represents the state of a particular generator |
| typedef struct{ uint x; uint c; } mwc64x_state_t; |
| |
| enum{ MWC64X_A = 4294883355U }; |
| enum{ MWC64X_M = 18446383549859758079UL }; |
| |
| void MWC64X_Step(mwc64x_state_t *s) |
| { |
| uint X=s->x, C=s->c; |
| |
| uint Xn=MWC64X_A*X+C; |
| uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar |
| uint Cn=mad_hi(MWC64X_A,X,carry); |
| |
| s->x=Xn; |
| s->c=Cn; |
| } |
| |
| void MWC64X_Skip(mwc64x_state_t *s, ulong distance) |
| { |
| uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance); |
| s->x=tmp.x; |
| s->c=tmp.y; |
| } |
| |
| void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) |
| { |
| uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset); |
| s->x=tmp.x; |
| s->c=tmp.y; |
| } |
| |
| //! Return a 32-bit integer in the range [0..2^32) |
| uint MWC64X_NextUint(mwc64x_state_t *s) |
| { |
| uint res=s->x ^ s->c; |
| MWC64X_Step(s); |
| return res; |
| } |
| |
| // |
| // End of MWC64X excerpt |
| // |
| |
| float mwcReadPseudoRandomValue(mwc64x_state_t* rng) |
| { |
| return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 |
| } |
| |
| float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate) |
| { |
| float |
| alpha, |
| beta, |
| noise, |
| sigma; |
| |
| noise = 0.0f; |
| alpha=mwcReadPseudoRandomValue(r); |
| switch(noise_type) |
| { |
| case UniformNoise: |
| default: |
| { |
| noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); |
| break; |
| } |
| case GaussianNoise: |
| { |
| float |
| gamma, |
| tau; |
| |
| if (alpha == 0.0f) |
| alpha=1.0f; |
| beta=mwcReadPseudoRandomValue(r); |
| gamma=sqrt(-2.0f*log(alpha)); |
| sigma=gamma*cospi((2.0f*beta)); |
| tau=gamma*sinpi((2.0f*beta)); |
| noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau; |
| break; |
| } |
| case ImpulseNoise: |
| { |
| if (alpha < (SigmaImpulse/2.0f)) |
| noise=0.0f; |
| else |
| if (alpha >= (1.0f-(SigmaImpulse/2.0f))) |
| noise=QuantumRange; |
| else |
| noise=pixel; |
| break; |
| } |
| case LaplacianNoise: |
| { |
| if (alpha <= 0.5f) |
| { |
| if (alpha <= MagickEpsilon) |
| noise=(pixel-QuantumRange); |
| else |
| noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ |
| 0.5f); |
| break; |
| } |
| beta=1.0f-alpha; |
| if (beta <= (0.5f*MagickEpsilon)) |
| noise=(pixel+QuantumRange); |
| else |
| noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); |
| break; |
| } |
| case MultiplicativeGaussianNoise: |
| { |
| sigma=1.0f; |
| if (alpha > MagickEpsilon) |
| sigma=sqrt(-2.0f*log(alpha)); |
| beta=mwcReadPseudoRandomValue(r); |
| noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma* |
| cospi((2.0f*beta))/2.0f); |
| break; |
| } |
| case PoissonNoise: |
| { |
| float |
| poisson; |
| unsigned int i; |
| poisson=exp(-SigmaPoisson*QuantumScale*pixel); |
| for (i=0; alpha > poisson; i++) |
| { |
| beta=mwcReadPseudoRandomValue(r); |
| alpha*=beta; |
| } |
| noise=(QuantumRange*i/SigmaPoisson); |
| break; |
| } |
| case RandomNoise: |
| { |
| noise=(QuantumRange*SigmaRandom*alpha); |
| break; |
| } |
| } |
| return noise; |
| } |
| |
| __kernel |
| void AddNoise(const __global CLQuantum *image, |
| const unsigned int number_channels,const ChannelType channel, |
| const unsigned int length,const unsigned int pixelsPerWorkItem, |
| const NoiseType noise_type,const float attenuate,const unsigned int seed0, |
| const unsigned int seed1,const unsigned int numRandomNumbersPerPixel, |
| __global CLQuantum *filteredImage) |
| { |
| mwc64x_state_t rng; |
| rng.x = seed0; |
| rng.c = seed1; |
| |
| uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use |
| uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); |
| MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams |
| |
| uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels); |
| uint count = pixelsPerWorkItem; |
| |
| while (count > 0 && pos < length) |
| { |
| const __global CLQuantum *p = image + pos; |
| __global CLQuantum *q = filteredImage + pos; |
| |
| float red; |
| float green; |
| float blue; |
| float alpha; |
| |
| ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); |
| |
| if ((channel & RedChannel) != 0) |
| red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate); |
| |
| if (number_channels > 2) |
| { |
| if ((channel & GreenChannel) != 0) |
| green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate); |
| |
| if ((channel & BlueChannel) != 0) |
| blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate); |
| } |
| |
| if (((number_channels == 4) || (number_channels == 2)) && |
| ((channel & AlphaChannel) != 0)) |
| alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate); |
| |
| WriteChannels(q, number_channels, channel, red, green, blue, alpha); |
| |
| pos += (get_local_size(0) * number_channels); |
| count--; |
| } |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % B l u r % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| /* |
| Reduce image noise and reduce detail levels by row |
| */ |
| __kernel void BlurRow(const __global CLQuantum *image, |
| const unsigned int number_channels,const ChannelType channel, |
| __constant float *filter,const unsigned int width, |
| const unsigned int imageColumns,const unsigned int imageRows, |
| __local float4 *temp,__global float4 *tempImage) |
| { |
| const int x = get_global_id(0); |
| const int y = get_global_id(1); |
| |
| const int columns = imageColumns; |
| |
| const unsigned int radius = (width-1)/2; |
| const int wsize = get_local_size(0); |
| const unsigned int loadSize = wsize+width; |
| |
| //group coordinate |
| const int groupX=get_local_size(0)*get_group_id(0); |
| |
| //parallel load and clamp |
| for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0)) |
| { |
| int cx = ClampToCanvas(i + groupX - radius, columns); |
| temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel); |
| } |
| |
| // barrier |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| // only do the work if this is not a patched item |
| if (get_global_id(0) < columns) |
| { |
| // compute |
| float4 result = (float4) 0; |
| |
| int i = 0; |
| |
| for ( ; i+7 < width; ) |
| { |
| for (int j=0; j < 8; j++) |
| result+=filter[i+j]*temp[i+j+get_local_id(0)]; |
| i+=8; |
| } |
| |
| for ( ; i < width; i++) |
| result+=filter[i]*temp[i+get_local_id(0)]; |
| |
| // write back to global |
| tempImage[y*columns+x] = result; |
| } |
| } |
| ) |
| |
| STRINGIFY( |
| /* |
| Reduce image noise and reduce detail levels by line |
| */ |
| __kernel void BlurColumn(const __global float4 *blurRowData, |
| const unsigned int number_channels,const ChannelType channel, |
| __constant float *filter,const unsigned int width, |
| const unsigned int imageColumns,const unsigned int imageRows, |
| __local float4 *temp,__global CLQuantum *filteredImage) |
| { |
| const int x = get_global_id(0); |
| const int y = get_global_id(1); |
| |
| const int columns = imageColumns; |
| const int rows = imageRows; |
| |
| unsigned int radius = (width-1)/2; |
| const int wsize = get_local_size(1); |
| const unsigned int loadSize = wsize+width; |
| |
| //group coordinate |
| const int groupX=get_local_size(0)*get_group_id(0); |
| const int groupY=get_local_size(1)*get_group_id(1); |
| //notice that get_local_size(0) is 1, so |
| //groupX=get_group_id(0); |
| |
| //parallel load and clamp |
| for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1)) |
| temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX]; |
| |
| // barrier |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| // only do the work if this is not a patched item |
| if (get_global_id(1) < rows) |
| { |
| // compute |
| float4 result = (float4) 0; |
| |
| int i = 0; |
| |
| for ( ; i+7 < width; ) |
| { |
| for (int j=0; j < 8; j++) |
| result+=filter[i+j]*temp[i+j+get_local_id(1)]; |
| i+=8; |
| } |
| |
| for ( ; i < width; i++) |
| result+=filter[i]*temp[i+get_local_id(1)]; |
| |
| // write back to global |
| WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); |
| } |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % C o n t r a s t % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| |
| inline float4 ConvertRGBToHSB(const float4 pixel) |
| { |
| float4 result=0.0f; |
| result.w=pixel.w; |
| float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z); |
| if (tmax != 0.0f) |
| { |
| float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z); |
| float delta=tmax-tmin; |
| |
| result.y=delta/tmax; |
| result.z=QuantumScale*tmax; |
| if (delta != 0.0f) |
| { |
| result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f)); |
| result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ? |
| (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta; |
| result.x/=6.0f; |
| result.x+=(result.x < 0.0f) ? 0.0f : 1.0f; |
| } |
| } |
| return(result); |
| } |
| |
| inline float4 ConvertHSBToRGB(const float4 pixel) |
| { |
| float hue=pixel.x; |
| float saturation=pixel.y; |
| float brightness=pixel.z; |
| |
| float4 result=pixel; |
| |
| if (saturation == 0.0f) |
| { |
| result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness); |
| } |
| else |
| { |
| float h=6.0f*(hue-floor(hue)); |
| float f=h-floor(h); |
| float p=brightness*(1.0f-saturation); |
| float q=brightness*(1.0f-saturation*f); |
| float t=brightness*(1.0f-(saturation*(1.0f-f))); |
| int ih = (int)h; |
| |
| if (ih == 1) |
| { |
| result.x=ClampToQuantum(QuantumRange*q); |
| result.y=ClampToQuantum(QuantumRange*brightness); |
| result.z=ClampToQuantum(QuantumRange*p); |
| } |
| else if (ih == 2) |
| { |
| result.x=ClampToQuantum(QuantumRange*p); |
| result.y=ClampToQuantum(QuantumRange*brightness); |
| result.z=ClampToQuantum(QuantumRange*t); |
| } |
| else if (ih == 3) |
| { |
| result.x=ClampToQuantum(QuantumRange*p); |
| result.y=ClampToQuantum(QuantumRange*q); |
| result.z=ClampToQuantum(QuantumRange*brightness); |
| } |
| else if (ih == 4) |
| { |
| result.x=ClampToQuantum(QuantumRange*t); |
| result.y=ClampToQuantum(QuantumRange*p); |
| result.z=ClampToQuantum(QuantumRange*brightness); |
| } |
| else if (ih == 5) |
| { |
| result.x=ClampToQuantum(QuantumRange*brightness); |
| result.y=ClampToQuantum(QuantumRange*p); |
| result.z=ClampToQuantum(QuantumRange*q); |
| } |
| else |
| { |
| result.x=ClampToQuantum(QuantumRange*brightness); |
| result.y=ClampToQuantum(QuantumRange*t); |
| result.z=ClampToQuantum(QuantumRange*p); |
| } |
| } |
| return(result); |
| } |
| |
| __kernel void Contrast(__global CLQuantum *image, |
| const unsigned int number_channels,const int sign) |
| { |
| const int x=get_global_id(0); |
| const int y=get_global_id(1); |
| const unsigned int columns=get_global_size(0); |
| |
| float4 pixel=ReadAllChannels(image,number_channels,columns,x,y); |
| if (number_channels < 3) |
| pixel.y=pixel.z=pixel.x; |
| |
| pixel=ConvertRGBToHSB(pixel); |
| float brightness=pixel.z; |
| brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); |
| brightness=clamp(brightness,0.0f,1.0f); |
| pixel.z=brightness; |
| pixel=ConvertHSBToRGB(pixel); |
| |
| WriteAllChannels(image,number_channels,columns,x,y,pixel); |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % C o n t r a s t S t r e t c h % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| /* |
| */ |
| __kernel void Histogram(__global CLPixelType * restrict im, |
| const ChannelType channel, |
| const unsigned int colorspace, |
| const unsigned int method, |
| __global uint4 * restrict histogram) |
| { |
| const int x = get_global_id(0); |
| const int y = get_global_id(1); |
| const int columns = get_global_size(0); |
| const int c = x + y * columns; |
| if ((channel & SyncChannels) != 0) |
| { |
| float red=(float)getRed(im[c]); |
| float green=(float)getGreen(im[c]); |
| float blue=(float)getBlue(im[c]); |
| |
| float intensity = GetPixelIntensity(colorspace, method, red, green, blue); |
| uint pos = ScaleQuantumToMap(ClampToQuantum(intensity)); |
| atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position |
| } |
| else |
| { |
| // for equalizing, we always need all channels? |
| // otherwise something more |
| } |
| } |
| ) |
| |
| STRINGIFY( |
| /* |
| */ |
| __kernel void ContrastStretch(__global CLPixelType * restrict im, |
| const ChannelType channel, |
| __global CLPixelType * restrict stretch_map, |
| const float4 white, const float4 black) |
| { |
| const int x = get_global_id(0); |
| const int y = get_global_id(1); |
| const int columns = get_global_size(0); |
| const int c = x + y * columns; |
| |
| uint ePos; |
| CLPixelType oValue, eValue; |
| CLQuantum red, green, blue, alpha; |
| |
| //read from global |
| oValue=im[c]; |
| |
| if ((channel & RedChannel) != 0) |
| { |
| if (getRedF4(white) != getRedF4(black)) |
| { |
| ePos = ScaleQuantumToMap(getRed(oValue)); |
| eValue = stretch_map[ePos]; |
| red = getRed(eValue); |
| } |
| } |
| |
| if ((channel & GreenChannel) != 0) |
| { |
| if (getGreenF4(white) != getGreenF4(black)) |
| { |
| ePos = ScaleQuantumToMap(getGreen(oValue)); |
| eValue = stretch_map[ePos]; |
| green = getGreen(eValue); |
| } |
| } |
| |
| if ((channel & BlueChannel) != 0) |
| { |
| if (getBlueF4(white) != getBlueF4(black)) |
| { |
| ePos = ScaleQuantumToMap(getBlue(oValue)); |
| eValue = stretch_map[ePos]; |
| blue = getBlue(eValue); |
| } |
| } |
| |
| if ((channel & AlphaChannel) != 0) |
| { |
| if (getAlphaF4(white) != getAlphaF4(black)) |
| { |
| ePos = ScaleQuantumToMap(getAlpha(oValue)); |
| eValue = stretch_map[ePos]; |
| alpha = getAlpha(eValue); |
| } |
| } |
| |
| //write back |
| im[c]=(CLPixelType)(blue, green, red, alpha); |
| |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % C o n v o l v e % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| __kernel |
| void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output, |
| const unsigned int imageWidth, const unsigned int imageHeight, |
| __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, |
| const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) { |
| |
| int2 blockID; |
| blockID.x = get_global_id(0) / get_local_size(0); |
| blockID.y = get_global_id(1) / get_local_size(1); |
| |
| // image area processed by this workgroup |
| int2 imageAreaOrg; |
| imageAreaOrg.x = blockID.x * get_local_size(0); |
| imageAreaOrg.y = blockID.y * get_local_size(1); |
| |
| int2 midFilterDimen; |
| midFilterDimen.x = (filterWidth-1)/2; |
| midFilterDimen.y = (filterHeight-1)/2; |
| |
| int2 cachedAreaOrg = imageAreaOrg - midFilterDimen; |
| |
| // dimension of the local cache |
| int2 cachedAreaDimen; |
| cachedAreaDimen.x = get_local_size(0) + filterWidth - 1; |
| cachedAreaDimen.y = get_local_size(1) + filterHeight - 1; |
| |
| // cache the pixels accessed by this workgroup in local memory |
| int localID = get_local_id(1)*get_local_size(0)+get_local_id(0); |
| int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y; |
| int groupSize = get_local_size(0) * get_local_size(1); |
| for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) { |
| |
| int2 cachedAreaIndex; |
| cachedAreaIndex.x = i % cachedAreaDimen.x; |
| cachedAreaIndex.y = i / cachedAreaDimen.x; |
| |
| int2 imagePixelIndex; |
| imagePixelIndex = cachedAreaOrg + cachedAreaIndex; |
| |
| // only support EdgeVirtualPixelMethod through ClampToCanvas |
| // TODO: implement other virtual pixel method |
| imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth); |
| imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight); |
| |
| pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x]; |
| } |
| |
| // cache the filter |
| for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) { |
| filterCache[i] = filter[i]; |
| } |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| |
| int2 imageIndex; |
| imageIndex.x = imageAreaOrg.x + get_local_id(0); |
| imageIndex.y = imageAreaOrg.y + get_local_id(1); |
| |
| // if out-of-range, stops here and quit |
| if (imageIndex.x >= imageWidth |
| || imageIndex.y >= imageHeight) { |
| return; |
| } |
| |
| int filterIndex = 0; |
| float4 sum = (float4)0.0f; |
| float gamma = 0.0f; |
| if (((channel & AlphaChannel) == 0) || (matte == 0)) { |
| int cacheIndexY = get_local_id(1); |
| for (int j = 0; j < filterHeight; j++) { |
| int cacheIndexX = get_local_id(0); |
| for (int i = 0; i < filterWidth; i++) { |
| CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; |
| float f = filterCache[filterIndex]; |
| |
| sum.x += f * p.x; |
| sum.y += f * p.y; |
| sum.z += f * p.z; |
| sum.w += f * p.w; |
| |
| gamma += f; |
| filterIndex++; |
| cacheIndexX++; |
| } |
| cacheIndexY++; |
| } |
| } |
| else { |
| int cacheIndexY = get_local_id(1); |
| for (int j = 0; j < filterHeight; j++) { |
| int cacheIndexX = get_local_id(0); |
| for (int i = 0; i < filterWidth; i++) { |
| |
| CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; |
| float alpha = QuantumScale*p.w; |
| float f = filterCache[filterIndex]; |
| float g = alpha * f; |
| |
| sum.x += g*p.x; |
| sum.y += g*p.y; |
| sum.z += g*p.z; |
| sum.w += f*p.w; |
| |
| gamma += g; |
| filterIndex++; |
| cacheIndexX++; |
| } |
| cacheIndexY++; |
| } |
| gamma = PerceptibleReciprocal(gamma); |
| sum.xyz = gamma*sum.xyz; |
| } |
| CLPixelType outputPixel; |
| outputPixel.x = ClampToQuantum(sum.x); |
| outputPixel.y = ClampToQuantum(sum.y); |
| outputPixel.z = ClampToQuantum(sum.z); |
| outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; |
| |
| output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; |
| } |
| ) |
| |
| STRINGIFY( |
| __kernel |
| void Convolve(const __global CLPixelType *input, __global CLPixelType *output, |
| const uint imageWidth, const uint imageHeight, |
| __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, |
| const uint matte, const ChannelType channel) { |
| |
| int2 imageIndex; |
| imageIndex.x = get_global_id(0); |
| imageIndex.y = get_global_id(1); |
| |
| /* |
| unsigned int imageWidth = get_global_size(0); |
| unsigned int imageHeight = get_global_size(1); |
| */ |
| if (imageIndex.x >= imageWidth |
| || imageIndex.y >= imageHeight) |
| return; |
| |
| int2 midFilterDimen; |
| midFilterDimen.x = (filterWidth-1)/2; |
| midFilterDimen.y = (filterHeight-1)/2; |
| |
| int filterIndex = 0; |
| float4 sum = (float4)0.0f; |
| float gamma = 0.0f; |
| if (((channel & AlphaChannel) == 0) || (matte == 0)) { |
| for (int j = 0; j < filterHeight; j++) { |
| int2 inputPixelIndex; |
| inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; |
| inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); |
| for (int i = 0; i < filterWidth; i++) { |
| inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; |
| inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); |
| |
| CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; |
| float f = filter[filterIndex]; |
| |
| sum.x += f * p.x; |
| sum.y += f * p.y; |
| sum.z += f * p.z; |
| sum.w += f * p.w; |
| |
| gamma += f; |
| |
| filterIndex++; |
| } |
| } |
| } |
| else { |
| |
| for (int j = 0; j < filterHeight; j++) { |
| int2 inputPixelIndex; |
| inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; |
| inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); |
| for (int i = 0; i < filterWidth; i++) { |
| inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; |
| inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); |
| |
| CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; |
| float alpha = QuantumScale*p.w; |
| float f = filter[filterIndex]; |
| float g = alpha * f; |
| |
| sum.x += g*p.x; |
| sum.y += g*p.y; |
| sum.z += g*p.z; |
| sum.w += f*p.w; |
| |
| gamma += g; |
| |
| |
| filterIndex++; |
| } |
| } |
| gamma = PerceptibleReciprocal(gamma); |
| sum.xyz = gamma*sum.xyz; |
| } |
| |
| CLPixelType outputPixel; |
| outputPixel.x = ClampToQuantum(sum.x); |
| outputPixel.y = ClampToQuantum(sum.y); |
| outputPixel.z = ClampToQuantum(sum.z); |
| outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; |
| |
| output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % D e s p e c k l e % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| |
| __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage |
| , const unsigned int imageWidth, const unsigned int imageHeight |
| , const int2 offset, const int polarity, const int matte) { |
| |
| int x = get_global_id(0); |
| int y = get_global_id(1); |
| |
| CLPixelType v = inputImage[y*imageWidth+x]; |
| |
| int2 neighbor; |
| neighbor.y = y + offset.y; |
| neighbor.x = x + offset.x; |
| |
| int2 clampedNeighbor; |
| clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); |
| clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); |
| |
| CLPixelType r = (clampedNeighbor.x == neighbor.x |
| && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] |
| :(CLPixelType)0; |
| |
| int sv[4]; |
| sv[0] = (int)v.x; |
| sv[1] = (int)v.y; |
| sv[2] = (int)v.z; |
| sv[3] = (int)v.w; |
| |
| int sr[4]; |
| sr[0] = (int)r.x; |
| sr[1] = (int)r.y; |
| sr[2] = (int)r.z; |
| sr[3] = (int)r.w; |
| |
| if (polarity > 0) { |
| \n #pragma unroll 4\n |
| for (unsigned int i = 0; i < 4; i++) { |
| sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i]; |
| } |
| } |
| else { |
| \n #pragma unroll 4\n |
| for (unsigned int i = 0; i < 4; i++) { |
| sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i]; |
| } |
| |
| } |
| |
| v.x = (CLQuantum)sv[0]; |
| v.y = (CLQuantum)sv[1]; |
| v.z = (CLQuantum)sv[2]; |
| |
| if (matte!=0) |
| v.w = (CLQuantum)sv[3]; |
| |
| outputImage[y*imageWidth+x] = v; |
| |
| } |
| |
| |
| ) |
| |
| STRINGIFY( |
| |
| __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage |
| , const unsigned int imageWidth, const unsigned int imageHeight |
| , const int2 offset, const int polarity, const int matte) { |
| |
| int x = get_global_id(0); |
| int y = get_global_id(1); |
| |
| CLPixelType v = inputImage[y*imageWidth+x]; |
| |
| int2 neighbor, clampedNeighbor; |
| |
| neighbor.y = y + offset.y; |
| neighbor.x = x + offset.x; |
| clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); |
| clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); |
| |
| CLPixelType r = (clampedNeighbor.x == neighbor.x |
| && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] |
| :(CLPixelType)0; |
| |
| |
| neighbor.y = y - offset.y; |
| neighbor.x = x - offset.x; |
| clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); |
| clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); |
| |
| CLPixelType s = (clampedNeighbor.x == neighbor.x |
| && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] |
| :(CLPixelType)0; |
| |
| |
| int sv[4]; |
| sv[0] = (int)v.x; |
| sv[1] = (int)v.y; |
| sv[2] = (int)v.z; |
| sv[3] = (int)v.w; |
| |
| int sr[4]; |
| sr[0] = (int)r.x; |
| sr[1] = (int)r.y; |
| sr[2] = (int)r.z; |
| sr[3] = (int)r.w; |
| |
| int ss[4]; |
| ss[0] = (int)s.x; |
| ss[1] = (int)s.y; |
| ss[2] = (int)s.z; |
| ss[3] = (int)s.w; |
| |
| if (polarity > 0) { |
| \n #pragma unroll 4\n |
| for (unsigned int i = 0; i < 4; i++) { |
| //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i]; |
| // |
| //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); |
| //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); |
| sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); |
| } |
| } |
| else { |
| \n #pragma unroll 4\n |
| for (unsigned int i = 0; i < 4; i++) { |
| //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i]; |
| // |
| //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); |
| sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); |
| } |
| } |
| |
| v.x = (CLQuantum)sv[0]; |
| v.y = (CLQuantum)sv[1]; |
| v.z = (CLQuantum)sv[2]; |
| |
| if (matte!=0) |
| v.w = (CLQuantum)sv[3]; |
| |
| outputImage[y*imageWidth+x] = v; |
| |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % E q u a l i z e % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| /* |
| */ |
| __kernel void Equalize(__global CLPixelType * restrict im, |
| const ChannelType channel, |
| __global CLPixelType * restrict equalize_map, |
| const float4 white, const float4 black) |
| { |
| const int x = get_global_id(0); |
| const int y = get_global_id(1); |
| const int columns = get_global_size(0); |
| const int c = x + y * columns; |
| |
| uint ePos; |
| CLPixelType oValue, eValue; |
| CLQuantum red, green, blue, alpha; |
| |
| //read from global |
| oValue=im[c]; |
| |
| if ((channel & SyncChannels) != 0) |
| { |
| if (getRedF4(white) != getRedF4(black)) |
| { |
| ePos = ScaleQuantumToMap(getRed(oValue)); |
| eValue = equalize_map[ePos]; |
| red = getRed(eValue); |
| ePos = ScaleQuantumToMap(getGreen(oValue)); |
| eValue = equalize_map[ePos]; |
| green = getRed(eValue); |
| ePos = ScaleQuantumToMap(getBlue(oValue)); |
| eValue = equalize_map[ePos]; |
| blue = getRed(eValue); |
| ePos = ScaleQuantumToMap(getAlpha(oValue)); |
| eValue = equalize_map[ePos]; |
| alpha = getRed(eValue); |
| |
| //write back |
| im[c]=(CLPixelType)(blue, green, red, alpha); |
| } |
| |
| } |
| |
| // for equalizing, we always need all channels? |
| // otherwise something more |
| |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % F u n c t i o n % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| /* |
| apply FunctionImageChannel(braightness-contrast) |
| */ |
| CLQuantum ApplyFunction(float pixel,const MagickFunction function, |
| const unsigned int number_parameters,__constant float *parameters) |
| { |
| float result = 0.0f; |
| |
| switch (function) |
| { |
| case PolynomialFunction: |
| { |
| for (unsigned int i=0; i < number_parameters; i++) |
| result = result*QuantumScale*pixel + parameters[i]; |
| result *= QuantumRange; |
| break; |
| } |
| case SinusoidFunction: |
| { |
| float freq,phase,ampl,bias; |
| freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; |
| phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f; |
| ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f; |
| bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; |
| result = QuantumRange*(ampl*sin(2.0f*MagickPI* |
| (freq*QuantumScale*pixel + phase/360.0f)) + bias); |
| break; |
| } |
| case ArcsinFunction: |
| { |
| float width,range,center,bias; |
| width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; |
| center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; |
| range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; |
| bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; |
| |
| result = 2.0f/width*(QuantumScale*pixel - center); |
| result = range/MagickPI*asin(result)+bias; |
| result = ( result <= -1.0f ) ? bias - range/2.0f : result; |
| result = ( result >= 1.0f ) ? bias + range/2.0f : result; |
| result *= QuantumRange; |
| break; |
| } |
| case ArctanFunction: |
| { |
| float slope,range,center,bias; |
| slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; |
| center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; |
| range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; |
| bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; |
| result = MagickPI*slope*(QuantumScale*pixel-center); |
| result = QuantumRange*(range/MagickPI*atan(result) + bias); |
| break; |
| } |
| case UndefinedFunction: |
| break; |
| } |
| return(ClampToQuantum(result)); |
| } |
| ) |
| |
| STRINGIFY( |
| /* |
| Improve brightness / contrast of the image |
| channel : define which channel is improved |
| function : the function called to enchance the brightness contrast |
| number_parameters : numbers of parameters |
| parameters : the parameter |
| */ |
| __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels, |
| const ChannelType channel,const MagickFunction function,const unsigned int number_parameters, |
| __constant float *parameters) |
| { |
| const unsigned int x = get_global_id(0); |
| const unsigned int y = get_global_id(1); |
| const unsigned int columns = get_global_size(0); |
| __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); |
| |
| float red; |
| float green; |
| float blue; |
| float alpha; |
| |
| ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); |
| |
| if ((channel & RedChannel) != 0) |
| red=ApplyFunction(red, function, number_parameters, parameters); |
| |
| if (number_channels > 2) |
| { |
| if ((channel & GreenChannel) != 0) |
| green=ApplyFunction(green, function, number_parameters, parameters); |
| |
| if ((channel & BlueChannel) != 0) |
| blue=ApplyFunction(blue, function, number_parameters, parameters); |
| } |
| |
| if (((number_channels == 4) || (number_channels == 2)) && |
| ((channel & AlphaChannel) != 0)) |
| alpha=ApplyFunction(alpha, function, number_parameters, parameters); |
| |
| WriteChannels(p, number_channels, channel, red, green, blue, alpha); |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % G r a y s c a l e % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| __kernel void Grayscale(__global CLQuantum *image,const int number_channels, |
| const unsigned int colorspace,const unsigned int method) |
| { |
| const unsigned int x = get_global_id(0); |
| const unsigned int y = get_global_id(1); |
| const unsigned int columns = get_global_size(0); |
| __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); |
| |
| float |
| blue, |
| green, |
| red; |
| |
| red=getPixelRed(p); |
| green=getPixelGreen(p); |
| blue=getPixelBlue(p); |
| |
| CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue)); |
| |
| setPixelRed(p,intensity); |
| setPixelGreen(p,intensity); |
| setPixelBlue(p,intensity); |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % L o c a l C o n t r a s t % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| |
| __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, |
| const int radius, |
| const int imageWidth, |
| const int imageHeight) |
| { |
| const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f)); |
| |
| int x = get_local_id(0); |
| int y = get_global_id(1); |
| |
| if ((x >= imageWidth) || (y >= imageHeight)) |
| return; |
| |
| global CLPixelType *src = srcImage + y * imageWidth; |
| |
| for (int i = x; i < imageWidth; i += get_local_size(0)) { |
| float sum = 0.0f; |
| float weight = 1.0f; |
| |
| int j = i - radius; |
| while ((j + 7) < i) { |
| for (int k = 0; k < 8; ++k) // Unroll 8x |
| sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)])); |
| weight += 8.0f; |
| j+=8; |
| } |
| while (j < i) { |
| sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)])); |
| weight += 1.0f; |
| ++j; |
| } |
| |
| while ((j + 7) < radius + i) { |
| for (int k = 0; k < 8; ++k) // Unroll 8x |
| sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)])); |
| weight -= 8.0f; |
| j+=8; |
| } |
| while (j < radius + i) { |
| sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)])); |
| weight -= 1.0f; |
| ++j; |
| } |
| |
| tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1)); |
| } |
| } |
| ) |
| |
| STRINGIFY( |
| __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage, |
| const int radius, |
| const float strength, |
| const int imageWidth, |
| const int imageHeight) |
| { |
| const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f); |
| |
| int x = get_global_id(0); |
| int y = get_global_id(1); |
| |
| if ((x >= imageWidth) || (y >= imageHeight)) |
| return; |
| |
| global float *src = blurImage + x; |
| |
| float sum = 0.0f; |
| float weight = 1.0f; |
| |
| int j = y - radius; |
| while ((j + 7) < y) { |
| for (int k = 0; k < 8; ++k) // Unroll 8x |
| sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth]; |
| weight += 8.0f; |
| j+=8; |
| } |
| while (j < y) { |
| sum += weight * src[mirrorBottom(j) * imageWidth]; |
| weight += 1.0f; |
| ++j; |
| } |
| |
| while ((j + 7) < radius + y) { |
| for (int k = 0; k < 8; ++k) // Unroll 8x |
| sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth]; |
| weight -= 8.0f; |
| j+=8; |
| } |
| while (j < radius + y) { |
| sum += weight * src[mirrorTop(j, imageHeight) * imageWidth]; |
| weight -= 1.0f; |
| ++j; |
| } |
| |
| CLPixelType pixel = srcImage[x + y * imageWidth]; |
| float srcVal = dot(RGB, convert_float4(pixel)); |
| float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f); |
| mult = (srcVal + mult) / srcVal; |
| |
| pixel.x = ClampToQuantum(pixel.x * mult); |
| pixel.y = ClampToQuantum(pixel.y * mult); |
| pixel.z = ClampToQuantum(pixel.z * mult); |
| |
| dstImage[x + y * imageWidth] = pixel; |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % M o d u l a t e % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| |
| inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue, |
| float *hue, float *saturation, float *lightness) |
| { |
| float |
| c, |
| tmax, |
| tmin; |
| |
| /* |
| Convert RGB to HSL colorspace. |
| */ |
| tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue)); |
| tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue)); |
| |
| c=tmax-tmin; |
| |
| *lightness=(tmax+tmin)/2.0; |
| if (c <= 0.0) |
| { |
| *hue=0.0; |
| *saturation=0.0; |
| return; |
| } |
| |
| if (tmax == (QuantumScale*red)) |
| { |
| *hue=(QuantumScale*green-QuantumScale*blue)/c; |
| if ((QuantumScale*green) < (QuantumScale*blue)) |
| *hue+=6.0; |
| } |
| else |
| if (tmax == (QuantumScale*green)) |
| *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c; |
| else |
| *hue=4.0+(QuantumScale*red-QuantumScale*green)/c; |
| |
| *hue*=60.0/360.0; |
| if (*lightness <= 0.5) |
| *saturation=c/(2.0*(*lightness)); |
| else |
| *saturation=c/(2.0-2.0*(*lightness)); |
| } |
| |
| inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness, |
| CLQuantum *red,CLQuantum *green,CLQuantum *blue) |
| { |
| float |
| b, |
| c, |
| g, |
| h, |
| tmin, |
| r, |
| x; |
| |
| /* |
| Convert HSL to RGB colorspace. |
| */ |
| h=hue*360.0; |
| if (lightness <= 0.5) |
| c=2.0*lightness*saturation; |
| else |
| c=(2.0-2.0*lightness)*saturation; |
| tmin=lightness-0.5*c; |
| h-=360.0*floor(h/360.0); |
| h/=60.0; |
| x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0)); |
| switch ((int) floor(h) % 6) |
| { |
| case 0: |
| { |
| r=tmin+c; |
| g=tmin+x; |
| b=tmin; |
| break; |
| } |
| case 1: |
| { |
| r=tmin+x; |
| g=tmin+c; |
| b=tmin; |
| break; |
| } |
| case 2: |
| { |
| r=tmin; |
| g=tmin+c; |
| b=tmin+x; |
| break; |
| } |
| case 3: |
| { |
| r=tmin; |
| g=tmin+x; |
| b=tmin+c; |
| break; |
| } |
| case 4: |
| { |
| r=tmin+x; |
| g=tmin; |
| b=tmin+c; |
| break; |
| } |
| case 5: |
| { |
| r=tmin+c; |
| g=tmin; |
| b=tmin+x; |
| break; |
| } |
| default: |
| { |
| r=0.0; |
| g=0.0; |
| b=0.0; |
| } |
| } |
| *red=ClampToQuantum(QuantumRange*r); |
| *green=ClampToQuantum(QuantumRange*g); |
| *blue=ClampToQuantum(QuantumRange*b); |
| } |
| |
| inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, |
| CLQuantum *red,CLQuantum *green,CLQuantum *blue) |
| { |
| float |
| hue, |
| lightness, |
| saturation; |
| |
| /* |
| Increase or decrease color lightness, saturation, or hue. |
| */ |
| ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness); |
| hue+=0.5*(0.01*percent_hue-1.0); |
| while (hue < 0.0) |
| hue+=1.0; |
| while (hue >= 1.0) |
| hue-=1.0; |
| saturation*=0.01*percent_saturation; |
| lightness*=0.01*percent_lightness; |
| ConvertHSLToRGB(hue,saturation,lightness,red,green,blue); |
| } |
| |
| __kernel void Modulate(__global CLPixelType *im, |
| const float percent_brightness, |
| const float percent_hue, |
| const float percent_saturation, |
| const int colorspace) |
| { |
| |
| const int x = get_global_id(0); |
| const int y = get_global_id(1); |
| const int columns = get_global_size(0); |
| const int c = x + y * columns; |
| |
| CLPixelType pixel = im[c]; |
| |
| CLQuantum |
| blue, |
| green, |
| red; |
| |
| red=getRed(pixel); |
| green=getGreen(pixel); |
| blue=getBlue(pixel); |
| |
| switch (colorspace) |
| { |
| case HSLColorspace: |
| default: |
| { |
| ModulateHSL(percent_hue, percent_saturation, percent_brightness, |
| &red, &green, &blue); |
| } |
| |
| } |
| |
| CLPixelType filteredPixel; |
| |
| setRed(&filteredPixel, red); |
| setGreen(&filteredPixel, green); |
| setBlue(&filteredPixel, blue); |
| filteredPixel.w = pixel.w; |
| |
| im[c] = filteredPixel; |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % M o t i o n B l u r % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| __kernel |
| void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output, |
| const unsigned int imageWidth, const unsigned int imageHeight, |
| const __global float *filter, const unsigned int width, const __global int2* offset, |
| const float4 bias, |
| const ChannelType channel, const unsigned int matte) { |
| |
| int2 currentPixel; |
| currentPixel.x = get_global_id(0); |
| currentPixel.y = get_global_id(1); |
| |
| if (currentPixel.x >= imageWidth |
| || currentPixel.y >= imageHeight) |
| return; |
| |
| float4 pixel; |
| pixel.x = (float)bias.x; |
| pixel.y = (float)bias.y; |
| pixel.z = (float)bias.z; |
| pixel.w = (float)bias.w; |
| |
| if (((channel & AlphaChannel) == 0) || (matte == 0)) { |
| |
| for (int i = 0; i < width; i++) { |
| // only support EdgeVirtualPixelMethod through ClampToCanvas |
| // TODO: implement other virtual pixel method |
| int2 samplePixel = currentPixel + offset[i]; |
| samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); |
| samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); |
| CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; |
| |
| pixel.x += (filter[i] * (float)samplePixelValue.x); |
| pixel.y += (filter[i] * (float)samplePixelValue.y); |
| pixel.z += (filter[i] * (float)samplePixelValue.z); |
| pixel.w += (filter[i] * (float)samplePixelValue.w); |
| } |
| |
| CLPixelType outputPixel; |
| outputPixel.x = ClampToQuantum(pixel.x); |
| outputPixel.y = ClampToQuantum(pixel.y); |
| outputPixel.z = ClampToQuantum(pixel.z); |
| outputPixel.w = ClampToQuantum(pixel.w); |
| output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; |
| } |
| else { |
| |
| float gamma = 0.0f; |
| for (int i = 0; i < width; i++) { |
| // only support EdgeVirtualPixelMethod through ClampToCanvas |
| // TODO: implement other virtual pixel method |
| int2 samplePixel = currentPixel + offset[i]; |
| samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); |
| samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); |
| |
| CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; |
| |
| float alpha = QuantumScale*samplePixelValue.w; |
| float k = filter[i]; |
| pixel.x = pixel.x + k * alpha * samplePixelValue.x; |
| pixel.y = pixel.y + k * alpha * samplePixelValue.y; |
| pixel.z = pixel.z + k * alpha * samplePixelValue.z; |
| |
| pixel.w += k * alpha * samplePixelValue.w; |
| |
| gamma+=k*alpha; |
| } |
| gamma = PerceptibleReciprocal(gamma); |
| pixel.xyz = gamma*pixel.xyz; |
| |
| CLPixelType outputPixel; |
| outputPixel.x = ClampToQuantum(pixel.x); |
| outputPixel.y = ClampToQuantum(pixel.y); |
| outputPixel.z = ClampToQuantum(pixel.z); |
| outputPixel.w = ClampToQuantum(pixel.w); |
| output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; |
| } |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % R e s i z e % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| // Based on Box from resize.c |
| float BoxResizeFilter(const float x) |
| { |
| return 1.0f; |
| } |
| ) |
| |
| STRINGIFY( |
| // Based on CubicBC from resize.c |
| float CubicBC(const float x,const __global float* resizeFilterCoefficients) |
| { |
| /* |
| Cubic Filters using B,C determined values: |
| Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter |
| Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears |
| Spline B = 1 C = 0 B-Spline Gaussian approximation |
| Hermite B = 0 C = 0 B-Spline interpolator |
| |
| See paper by Mitchell and Netravali, Reconstruction Filters in Computer |
| Graphics Computer Graphics, Volume 22, Number 4, August 1988 |
| http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/ |
| Mitchell.pdf. |
| |
| Coefficents are determined from B,C values: |
| P0 = ( 6 - 2*B )/6 = coeff[0] |
| P1 = 0 |
| P2 = (-18 +12*B + 6*C )/6 = coeff[1] |
| P3 = ( 12 - 9*B - 6*C )/6 = coeff[2] |
| Q0 = ( 8*B +24*C )/6 = coeff[3] |
| Q1 = ( -12*B -48*C )/6 = coeff[4] |
| Q2 = ( 6*B +30*C )/6 = coeff[5] |
| Q3 = ( - 1*B - 6*C )/6 = coeff[6] |
| |
| which are used to define the filter: |
| |
| P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1 |
| Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2 |
| |
| which ensures function is continuous in value and derivative (slope). |
| */ |
| if (x < 1.0) |
| return(resizeFilterCoefficients[0]+x*(x* |
| (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2]))); |
| if (x < 2.0) |
| return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x* |
| (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6]))); |
| return(0.0); |
| } |
| ) |
| |
| STRINGIFY( |
| float Sinc(const float x) |
| { |
| if (x != 0.0f) |
| { |
| const float alpha=(float) (MagickPI*x); |
| return sinpi(x)/alpha; |
| } |
| return(1.0f); |
| } |
| ) |
| |
| STRINGIFY( |
| float Triangle(const float x) |
| { |
| /* |
| 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or |
| a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function |
| for Sinc(). |
| */ |
| return ((x<1.0f)?(1.0f-x):0.0f); |
| } |
| ) |
| |
| |
| STRINGIFY( |
| float Hann(const float x) |
| { |
| /* |
| Cosine window function: |
| 0.5+0.5*cos(pi*x). |
| */ |
| const float cosine=cos((MagickPI*x)); |
| return(0.5f+0.5f*cosine); |
| } |
| ) |
| |
| STRINGIFY( |
| float Hamming(const float x) |
| { |
| /* |
| Offset cosine window function: |
| .54 + .46 cos(pi x). |
| */ |
| const float cosine=cos((MagickPI*x)); |
| return(0.54f+0.46f*cosine); |
| } |
| ) |
| |
| STRINGIFY( |
| float Blackman(const float x) |
| { |
| /* |
| Blackman: 2nd order cosine windowing function: |
| 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x) |
| |
| Refactored by Chantal Racette and Nicolas Robidoux to one trig call and |
| five flops. |
| */ |
| const float cosine=cos((MagickPI*x)); |
| return(0.34f+cosine*(0.5f+cosine*0.16f)); |
| } |
| ) |
| |
| STRINGIFY( |
| inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients) |
| { |
| switch (filterType) |
| { |
| /* Call Sinc even for SincFast to get better precision on GPU |
| and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/ |
| case SincWeightingFunction: |
| case SincFastWeightingFunction: |
| return Sinc(x); |
| case CubicBCWeightingFunction: |
| return CubicBC(x,filterCoefficients); |
| case BoxWeightingFunction: |
| return BoxResizeFilter(x); |
| case TriangleWeightingFunction: |
| return Triangle(x); |
| case HannWeightingFunction: |
| return Hann(x); |
| case HammingWeightingFunction: |
| return Hamming(x); |
| case BlackmanWeightingFunction: |
| return Blackman(x); |
| |
| default: |
| return 0.0f; |
| } |
| } |
| ) |
| |
| |
| STRINGIFY( |
| inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType |
| , const ResizeWeightingFunctionType resizeWindowType |
| , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x) |
| { |
| float scale; |
| float xBlur = fabs(x/resizeFilterBlur); |
| if (resizeWindowSupport < MagickEpsilon |
| || resizeWindowType == BoxWeightingFunction) |
| { |
| scale = 1.0f; |
| } |
| else |
| { |
| scale = resizeFilterScale; |
| scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients); |
| } |
| float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients); |
| return weight; |
| } |
| |
| ) |
| |
| ; |
| const char *accelerateKernels2 = |
| |
| STRINGIFY( |
| |
| inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { |
| return (numWorkItems/pixelPerWorkgroup); |
| } |
| |
| // returns the index of the pixel for the current workitem to compute. |
| // returns -1 if this workitem doesn't need to participate in any computation |
| inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { |
| const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems); |
| int pixelIndex = itemID/numWorkItemsPerPixel; |
| pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1; |
| return pixelIndex; |
| } |
| |
| ) |
| |
| STRINGIFY( |
| __kernel __attribute__((reqd_work_group_size(256, 1, 1))) |
| void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, |
| const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, |
| const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor, |
| const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, |
| const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, |
| const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, |
| const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, |
| __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) |
| { |
| // calculate the range of resized image pixels computed by this workgroup |
| const unsigned int startX = get_group_id(0)*pixelPerWorkgroup; |
| const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns); |
| const unsigned int actualNumPixelToCompute = stopX - startX; |
| |
| // calculate the range of input image pixels to cache |
| float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f); |
| const float support = MagickMax(scale*resizeFilterSupport,0.5f); |
| scale = PerceptibleReciprocal(scale); |
| |
| const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0)); |
| const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns); |
| |
| // cache the input pixels into local memory |
| const unsigned int y = get_global_id(1); |
| const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y); |
| const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels; |
| event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0); |
| wait_group_events(1, &e); |
| |
| unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; |
| unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; |
| for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) |
| { |
| const unsigned int chunkStartX = startX + chunk*pixelChunkSize; |
| const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX); |
| const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX; |
| |
| // determine which resized pixel computed by this workitem |
| const unsigned int itemID = get_local_id(0); |
| const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0)); |
| |
| const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0)); |
| |
| float4 filteredPixel = (float4)0.0f; |
| float density = 0.0f; |
| float gamma = 0.0f; |
| // -1 means this workitem doesn't participate in the computation |
| if (pixelIndex != -1) |
| { |
| // x coordinated of the resized pixel computed by this workitem |
| const int x = chunkStartX + pixelIndex; |
| |
| // calculate how many steps required for this pixel |
| const float bisect = (x+0.5)/xFactor+MagickEpsilon; |
| const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); |
| const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns); |
| const unsigned int n = stop - start; |
| |
| // calculate how many steps this workitem will contribute |
| unsigned int numStepsPerWorkItem = n / numItems; |
| numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); |
| |
| const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; |
| if (startStep < n) |
| { |
| const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); |
| |
| unsigned int cacheIndex = start+startStep-cacheRangeStartX; |
| for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) |
| { |
| float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, |
| (ResizeWeightingFunctionType) resizeFilterType, |
| (ResizeWeightingFunctionType) resizeWindowType, |
| resizeFilterScale, resizeFilterWindowSupport, |
| resizeFilterBlur, scale*(start + i - bisect + 0.5)); |
| |
| float4 cp = (float4)0.0f; |
| |
| __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels); |
| cp.x = (float) *(p); |
| if (number_channels > 2) |
| { |
| cp.y = (float) *(p + 1); |
| cp.z = (float) *(p + 2); |
| } |
| |
| if (alpha_index != 0) |
| { |
| cp.w = (float) *(p + alpha_index); |
| |
| float alpha = weight * QuantumScale * cp.w; |
| |
| filteredPixel.x += alpha * cp.x; |
| filteredPixel.y += alpha * cp.y; |
| filteredPixel.z += alpha * cp.z; |
| filteredPixel.w += weight * cp.w; |
| gamma += alpha; |
| } |
| else |
| filteredPixel += ((float4) weight)*cp; |
| |
| density += weight; |
| } |
| } |
| } |
| |
| // initialize the accumulators to zero |
| if (itemID < actualNumPixelInThisChunk) { |
| outputPixelCache[itemID] = (float4)0.0f; |
| densityCache[itemID] = 0.0f; |
| if (alpha_index != 0) |
| gammaCache[itemID] = 0.0f; |
| } |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| // accumulatte the filtered pixel value and the density |
| for (unsigned int i = 0; i < numItems; i++) { |
| if (pixelIndex != -1) { |
| if (itemID%numItems == i) { |
| outputPixelCache[pixelIndex]+=filteredPixel; |
| densityCache[pixelIndex]+=density; |
| if (alpha_index != 0) |
| gammaCache[pixelIndex]+=gamma; |
| } |
| } |
| barrier(CLK_LOCAL_MEM_FENCE); |
| } |
| |
| if (itemID < actualNumPixelInThisChunk) |
| { |
| float4 filteredPixel = outputPixelCache[itemID]; |
| |
| float gamma = 0.0f; |
| if (alpha_index != 0) |
| gamma = gammaCache[itemID]; |
| |
| float density = densityCache[itemID]; |
| if ((density != 0.0f) && (density != 1.0f)) |
| { |
| density = PerceptibleReciprocal(density); |
| filteredPixel *= (float4) density; |
| if (alpha_index != 0) |
| gamma *= density; |
| } |
| |
| if (alpha_index != 0) |
| { |
| gamma = PerceptibleReciprocal(gamma); |
| filteredPixel.x *= gamma; |
| filteredPixel.y *= gamma; |
| filteredPixel.z *= gamma; |
| } |
| |
| WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel); |
| } |
| } |
| } |
| ) |
| |
| |
| STRINGIFY( |
| __kernel __attribute__((reqd_work_group_size(1, 256, 1))) |
| void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, |
| const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, |
| const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor, |
| const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, |
| const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, |
| const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, |
| const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, |
| __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) |
| { |
| // calculate the range of resized image pixels computed by this workgroup |
| const unsigned int startY = get_group_id(1)*pixelPerWorkgroup; |
| const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows); |
| const unsigned int actualNumPixelToCompute = stopY - startY; |
| |
| // calculate the range of input image pixels to cache |
| float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f); |
| const float support = MagickMax(scale*resizeFilterSupport,0.5f); |
| scale = PerceptibleReciprocal(scale); |
| |
| const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0)); |
| const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); |
| |
| // cache the input pixels into local memory |
| const unsigned int x = get_global_id(0); |
| unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY); |
| unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY; |
| unsigned int stride = inputColumns * number_channels; |
| for (unsigned int i = 0; i < number_channels; i++) |
| { |
| event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0); |
| wait_group_events(1,&e); |
| } |
| |
| unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; |
| unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; |
| for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) |
| { |
| const unsigned int chunkStartY = startY + chunk*pixelChunkSize; |
| const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY); |
| const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY; |
| |
| // determine which resized pixel computed by this workitem |
| const unsigned int itemID = get_local_id(1); |
| const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1)); |
| |
| const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1)); |
| |
| float4 filteredPixel = (float4)0.0f; |
| float density = 0.0f; |
| float gamma = 0.0f; |
| // -1 means this workitem doesn't participate in the computation |
| if (pixelIndex != -1) |
| { |
| // x coordinated of the resized pixel computed by this workitem |
| const int y = chunkStartY + pixelIndex; |
| |
| // calculate how many steps required for this pixel |
| const float bisect = (y+0.5)/yFactor+MagickEpsilon; |
| const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); |
| const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows); |
| const unsigned int n = stop - start; |
| |
| // calculate how many steps this workitem will contribute |
| unsigned int numStepsPerWorkItem = n / numItems; |
| numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); |
| |
| const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; |
| if (startStep < n) |
| { |
| const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); |
| |
| unsigned int cacheIndex = start+startStep-cacheRangeStartY; |
| for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) |
| { |
| float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, |
| (ResizeWeightingFunctionType) resizeFilterType, |
| (ResizeWeightingFunctionType) resizeWindowType, |
| resizeFilterScale, resizeFilterWindowSupport, |
| resizeFilterBlur, scale*(start + i - bisect + 0.5)); |
| |
| float4 cp = (float4)0.0f; |
| |
| __local CLQuantum *p = inputImageCache + cacheIndex; |
| cp.x = (float) *(p); |
| if (number_channels > 2) |
| { |
| cp.y = (float) *(p + rangeLength); |
| cp.z = (float) *(p + (rangeLength * 2)); |
| } |
| |
| if (alpha_index != 0) |
| { |
| cp.w = (float) *(p + (rangeLength * alpha_index)); |
| |
| float alpha = weight * QuantumScale * cp.w; |
| |
| filteredPixel.x += alpha * cp.x; |
| filteredPixel.y += alpha * cp.y; |
| filteredPixel.z += alpha * cp.z; |
| filteredPixel.w += weight * cp.w; |
| gamma += alpha; |
| } |
| else |
| filteredPixel += ((float4) weight)*cp; |
| |
| density += weight; |
| } |
| } |
| } |
| |
| // initialize the accumulators to zero |
| if (itemID < actualNumPixelInThisChunk) { |
| outputPixelCache[itemID] = (float4)0.0f; |
| densityCache[itemID] = 0.0f; |
| if (alpha_index != 0) |
| gammaCache[itemID] = 0.0f; |
| } |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| // accumulatte the filtered pixel value and the density |
| for (unsigned int i = 0; i < numItems; i++) { |
| if (pixelIndex != -1) { |
| if (itemID%numItems == i) { |
| outputPixelCache[pixelIndex]+=filteredPixel; |
| densityCache[pixelIndex]+=density; |
| if (alpha_index != 0) |
| gammaCache[pixelIndex]+=gamma; |
| } |
| } |
| barrier(CLK_LOCAL_MEM_FENCE); |
| } |
| |
| if (itemID < actualNumPixelInThisChunk) |
| { |
| float4 filteredPixel = outputPixelCache[itemID]; |
| |
| float gamma = 0.0f; |
| if (alpha_index != 0) |
| gamma = gammaCache[itemID]; |
| |
| float density = densityCache[itemID]; |
| if ((density != 0.0f) && (density != 1.0f)) |
| { |
| density = PerceptibleReciprocal(density); |
| filteredPixel *= (float4) density; |
| if (alpha_index != 0) |
| gamma *= density; |
| } |
| |
| if (alpha_index != 0) |
| { |
| gamma = PerceptibleReciprocal(gamma); |
| filteredPixel.x *= gamma; |
| filteredPixel.y *= gamma; |
| filteredPixel.z *= gamma; |
| } |
| |
| WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel); |
| } |
| } |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % R o t a t i o n a l B l u r % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| __kernel void RotationalBlur(const __global CLQuantum *image, |
| const unsigned int number_channels,const unsigned int channel, |
| const float2 blurCenter,__constant float *cos_theta, |
| __constant float *sin_theta,const unsigned int cossin_theta_size, |
| __global CLQuantum *filteredImage) |
| { |
| const int x = get_global_id(0); |
| const int y = get_global_id(1); |
| const int columns = get_global_size(0); |
| const int rows = get_global_size(1); |
| unsigned int step = 1; |
| float center_x = (float) x - blurCenter.x; |
| float center_y = (float) y - blurCenter.y; |
| float radius = hypot(center_x, center_y); |
| |
| float blur_radius = hypot(blurCenter.x, blurCenter.y); |
| |
| if (radius > MagickEpsilon) |
| { |
| step = (unsigned int) (blur_radius / radius); |
| if (step == 0) |
| step = 1; |
| if (step >= cossin_theta_size) |
| step = cossin_theta_size-1; |
| } |
| |
| float4 result = 0.0f; |
| float normalize = 0.0f; |
| float gamma = 0.0f; |
| |
| for (unsigned int i=0; i<cossin_theta_size; i+=step) |
| { |
| int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns); |
| int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows); |
| |
| float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy); |
| |
| if ((number_channels == 4) || (number_channels == 2)) |
| { |
| float alpha = (float)(QuantumScale*pixel.w); |
| |
| gamma += alpha; |
| |
| result.x += alpha * pixel.x; |
| result.y += alpha * pixel.y; |
| result.z += alpha * pixel.z; |
| result.w += pixel.w; |
| } |
| else |
| result += pixel; |
| |
| normalize += 1.0f; |
| } |
| |
| normalize = PerceptibleReciprocal(normalize); |
| |
| if ((number_channels == 4) || (number_channels == 2)) |
| { |
| gamma = PerceptibleReciprocal(gamma); |
| result.x *= gamma; |
| result.y *= gamma; |
| result.z *= gamma; |
| result.w *= normalize; |
| } |
| else |
| result *= normalize; |
| |
| WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); |
| } |
| ) |
| |
| /* |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| % % |
| % % |
| % % |
| % U n s h a r p M a s k % |
| % % |
| % % |
| % % |
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
| */ |
| |
| STRINGIFY( |
| __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image, |
| const __global float4 *blurRowData,const unsigned int number_channels, |
| const ChannelType channel,const unsigned int columns, |
| const unsigned int rows,__local float4* cachedData, |
| __local float* cachedFilter,const __global float *filter, |
| const unsigned int width,const float gain, const float threshold, |
| __global CLQuantum *filteredImage) |
| { |
| const unsigned int radius = (width-1)/2; |
| |
| // cache the pixel shared by the workgroup |
| const int groupX = get_group_id(0); |
| const int groupStartY = get_group_id(1)*get_local_size(1) - radius; |
| const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius; |
| |
| if ((groupStartY >= 0) && (groupStopY < rows)) |
| { |
| event_t e = async_work_group_strided_copy(cachedData, |
| blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0); |
| wait_group_events(1,&e); |
| } |
| else |
| { |
| for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) |
| cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX]; |
| |
| barrier(CLK_LOCAL_MEM_FENCE); |
| } |
| // cache the filter as well |
| event_t e = async_work_group_copy(cachedFilter,filter,width,0); |
| wait_group_events(1,&e); |
| |
| // only do the work if this is not a patched item |
| const int cy = get_global_id(1); |
| |
| if (cy < rows) |
| { |
| float4 blurredPixel = (float4) 0.0f; |
| |
| int i = 0; |
| |
| for ( ; i+7 < width; ) |
| { |
| for (int j=0; j < 8; j++, i++) |
| blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)]; |
| } |
| |
| for ( ; i < width; i++) |
| blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)]; |
| |
| float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel); |
| float4 outputPixel = inputImagePixel - blurredPixel; |
| |
| float quantumThreshold = QuantumRange*threshold; |
| |
| int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold); |
| outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask); |
| |
| //write back |
| WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel); |
| } |
| } |
| ) |
| |
| STRINGIFY( |
| __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels, |
| const ChannelType channel,__constant float *filter,const unsigned int width, |
| const unsigned int columns,const unsigned int rows,__local float4 *pixels, |
| const float gain,const float threshold,__global CLQuantum *filteredImage) |
| { |
| const unsigned int x = get_global_id(0); |
| const unsigned int y = get_global_id(1); |
| |
| const unsigned int radius = (width - 1) / 2; |
| |
| int row = y - radius; |
| int baseRow = get_group_id(1) * get_local_size(1) - radius; |
| int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; |
| |
| while (row < endRow) { |
| int srcy = (row < 0) ? -row : row; // mirror pad |
| srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy; |
| |
| float4 value = 0.0f; |
| |
| int ix = x - radius; |
| int i = 0; |
| |
| while (i + 7 < width) { |
| for (int j = 0; j < 8; ++j) { // unrolled |
| int srcx = ix + j; |
| srcx = (srcx < 0) ? -srcx : srcx; |
| srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; |
| value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); |
| } |
| ix += 8; |
| i += 8; |
| } |
| |
| while (i < width) { |
| int srcx = (ix < 0) ? -ix : ix; // mirror pad |
| srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; |
| value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); |
| ++i; |
| ++ix; |
| } |
| pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; |
| row += get_local_size(1); |
| } |
| |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| const int px = get_local_id(0); |
| const int py = get_local_id(1); |
| const int prp = get_local_size(0); |
| float4 value = (float4)(0.0f); |
| |
| int i = 0; |
| while (i + 7 < width) { |
| for (int j = 0; j < 8; ++j) // unrolled |
| value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp]; |
| i += 8; |
| } |
| while (i < width) { |
| value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; |
| ++i; |
| } |
| |
| if ((x < columns) && (y < rows)) { |
| float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel); |
| float4 diff = srcPixel - value; |
| |
| float quantumThreshold = QuantumRange*threshold; |
| |
| int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); |
| value = select(srcPixel + diff * gain, srcPixel, mask); |
| |
| WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value); |
| } |
| } |
| ) |
| |
| STRINGIFY( |
| __kernel __attribute__((reqd_work_group_size(64, 4, 1))) |
| void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, |
| const unsigned int number_channels,const unsigned int max_channels, |
| const float threshold,const int passes,const unsigned int imageWidth, |
| const unsigned int imageHeight) |
| { |
| const int pad = (1 << (passes - 1)); |
| const int tileSize = 64; |
| const int tileRowPixels = 64; |
| const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; |
| |
| CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) |
| |
| local float buffer[64 * 64]; |
| |
| int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0); |
| int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad; |
| |
| for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { |
| int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + |
| (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; |
| |
| for (int channel = 0; channel < max_channels; ++channel) |
| stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; |
| } |
| |
| for (int channel = 0; channel < max_channels; ++channel) { |
| // Load LDS |
| for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) |
| buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); |
| |
| // Process |
| |
| float tmp[16]; |
| float accum[16]; |
| float pixel; |
| |
| for (int i = 0; i < 16; i++) |
| accum[i]=0.0f; |
| |
| for (int pass = 0; pass < passes; ++pass) { |
| const int radius = 1 << pass; |
| const int x = get_local_id(0); |
| const float thresh = threshold * noise[pass]; |
| |
| // Apply horizontal hat |
| for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { |
| const int offset = i * tileRowPixels; |
| if (pass == 0) |
| tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass |
| pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]); |
| barrier(CLK_LOCAL_MEM_FENCE); |
| buffer[x + offset] = pixel; |
| } |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| // Apply vertical hat |
| for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { |
| pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); |
| float delta = tmp[i / 4] - pixel; |
| tmp[i / 4] = pixel; // hold output in tmp until all workitems are done |
| if (delta < -thresh) |
| delta += thresh; |
| else if (delta > thresh) |
| delta -= thresh; |
| else |
| delta = 0; |
| accum[i / 4] += delta; |
| } |
| barrier(CLK_LOCAL_MEM_FENCE); |
| |
| if (pass < passes - 1) |
| for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) |
| buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass |
| else // last pass |
| for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) |
| accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output |
| barrier(CLK_LOCAL_MEM_FENCE); |
| } |
| |
| for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) |
| stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); |
| |
| barrier(CLK_LOCAL_MEM_FENCE); |
| } |
| |
| // Write from stage to output |
| |
| if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { |
| for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { |
| if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { |
| int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); |
| for (int channel = 0; channel < max_channels; ++channel) { |
| dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; |
| } |
| } |
| } |
| } |
| } |
| ) |
| |
| ; |
| |
| #endif // MAGICKCORE_OPENCL_SUPPORT |
| |
| #if defined(__cplusplus) || defined(c_plusplus) |
| } |
| #endif |
| |
| #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H |