Blame - MagickCore/accelerate-private.h - platform/external/ImageMagick

blob: fda8472822eb74d0a417884890dd36a5d1ae6567 [file] [log] [blame]

cristy	e110189	2013-08-18 00:53:18 +0000	[diff] [blame]	1	/*
cristy	fe676ee	2013-11-18 13:03:38 +0000	[diff] [blame]	2	Copyright 1999-2014 ImageMagick Studio LLC, a non-profit organization
cristy	e110189	2013-08-18 00:53:18 +0000	[diff] [blame]	3	dedicated to making software imaging solutions freely available.
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	4
cristy	e110189	2013-08-18 00:53:18 +0000	[diff] [blame]	5	You may not use this file except in compliance with the License.
				6	obtain a copy of the License at
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	7
cristy	e110189	2013-08-18 00:53:18 +0000	[diff] [blame]	8	http://www.imagemagick.org/script/license.php
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	9
cristy	e110189	2013-08-18 00:53:18 +0000	[diff] [blame]	10	Unless required by applicable law or agreed to in writing, software
				11	distributed under the License is distributed on an "AS IS" BASIS,
				12	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	See the License for the specific language governing permissions and
				14	limitations under the License.
				15
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	16	MagickCore private methods for accelerated functions.
cristy	e110189	2013-08-18 00:53:18 +0000	[diff] [blame]	17	*/
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	18
cristy	e110189	2013-08-18 00:53:18 +0000	[diff] [blame]	19	#ifndef _MAGICKCORE_ACCELERATE_PRIVATE_H
				20	#define _MAGICKCORE_ACCELERATE_PRIVATE_H
				21
				22	#if defined(__cplusplus) \|\| defined(c_plusplus)
				23	extern "C" {
				24	#endif
				25
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	26
				27	#if defined(MAGICKCORE_OPENCL_SUPPORT)
				28
				29	#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
				30	#define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
				31	#define OPENCL_ELSE() "\n #""else " " \n"
				32	#define OPENCL_ENDIF() "\n #""endif " " \n"
				33	#define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
				34	#define STRINGIFY(...) #__VA_ARGS__ "\n"
				35
				36	typedef struct _FloatPixelPacket
				37	{
cristy	a22457d	2013-12-07 14:03:06 +0000	[diff] [blame]	38	#ifdef MAGICK_PIXEL_RGBA
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	39	MagickRealType
				40	red,
				41	green,
				42	blue,
cristy	a22457d	2013-12-07 14:03:06 +0000	[diff] [blame]	43	opacity;
				44	#endif
				45	#ifdef MAGICK_PIXEL_BGRA
				46	MagickRealType
				47	blue,
				48	green,
				49	red,
				50	opacity;
				51	#endif
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	52	} FloatPixelPacket;
				53
				54	const char* accelerateKernels =
				55	STRINGIFY(
				56	typedef enum
				57	{
				58	UndefinedChannel,
				59	RedChannel = 0x0001,
				60	GrayChannel = 0x0001,
				61	CyanChannel = 0x0001,
				62	GreenChannel = 0x0002,
				63	MagentaChannel = 0x0002,
				64	BlueChannel = 0x0004,
				65	YellowChannel = 0x0004,
				66	AlphaChannel = 0x0008,
				67	OpacityChannel = 0x0008,
				68	MatteChannel = 0x0008, /* deprecated */
				69	BlackChannel = 0x0020,
				70	IndexChannel = 0x0020,
				71	CompositeChannels = 0x002F,
				72	AllChannels = 0x7ffffff,
				73	/*
				74	Special purpose channel types.
				75	*/
				76	TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
				77	RGBChannels = 0x0080, /* set alpha from grayscale mask in RGB */
				78	GrayChannels = 0x0080,
				79	SyncChannels = 0x0100, /* channels should be modified equally */
				80	DefaultChannels = ((AllChannels \| SyncChannels) &~ OpacityChannel)
				81	} ChannelType;
				82	)
				83
				84	OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
				85
				86	STRINGIFY(
				87	inline CLQuantum ScaleCharToQuantum(const unsigned char value)
				88	{
				89	return((CLQuantum) value);
				90	}
				91	)
				92
				93	OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
				94
				95	STRINGIFY(
				96	inline CLQuantum ScaleCharToQuantum(const unsigned char value)
				97	{
				98	return((CLQuantum) (257.0f*value));
				99	}
				100	)
				101
				102	OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
				103
				104	STRINGIFY(
				105	inline CLQuantum ScaleCharToQuantum(const unsigned char value)
				106	{
				107	return((Quantum) (16843009.0*value));
				108	}
				109	)
				110
				111	OPENCL_ENDIF()
				112
				113
				114	STRINGIFY(
				115	inline int ClampToCanvas(const int offset,const int range)
				116	{
				117	return clamp(offset, (int)0, range-1);
				118	}
				119	)
				120
				121	STRINGIFY(
				122	inline int ClampToCanvasWithHalo(const int offset,const int range, const int edge, const int section)
				123	{
				124	return clamp(offset, section?(int)(0-edge):(int)0, section?(range-1):(range-1+edge));
				125	}
				126	)
				127
				128	STRINGIFY(
				129	inline CLQuantum ClampToQuantum(const float value)
				130	{
				131	return (CLQuantum) (clamp(value, 0.0f, (float) QuantumRange) + 0.5f);
				132	}
				133	)
				134
				135	STRINGIFY(
				136	inline uint ScaleQuantumToMap(CLQuantum value)
				137	{
				138	if (value >= (CLQuantum) MaxMap)
				139	return ((uint)MaxMap);
				140	else
				141	return ((uint)value);
				142	}
				143	)
				144
				145	STRINGIFY(
				146	inline float PerceptibleReciprocal(const float x)
				147	{
				148	float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
				149	return((signx) >= MagickEpsilon ? (float) 1.0/x : sign((float) 1.0/MagickEpsilon));
				150	}
				151	)
				152
				153	OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
				154
				155	STRINGIFY(
				156
				157	inline CLQuantum getBlue(CLPixelType p) { return p.x; }
				158	inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
				159	inline float getBlueF4(float4 p) { return p.x; }
				160	inline void setBlueF4(float4* p, float value) { (*p).x = value; }
				161
				162	inline CLQuantum getGreen(CLPixelType p) { return p.y; }
				163	inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
				164	inline float getGreenF4(float4 p) { return p.y; }
				165	inline void setGreenF4(float4* p, float value) { (*p).y = value; }
				166
				167	inline CLQuantum getRed(CLPixelType p) { return p.z; }
				168	inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
				169	inline float getRedF4(float4 p) { return p.z; }
				170	inline void setRedF4(float4* p, float value) { (*p).z = value; }
				171
				172	inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
				173	inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
				174	inline float getOpacityF4(float4 p) { return p.w; }
				175	inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
				176
				177	inline float GetPixelIntensity(int colorspace, CLPixelType p)
				178	{
				179	// this is for default intensity and sRGB (not RGB) color space
				180	float red = getRed(p);
				181	float green = getGreen(p);
				182	float blue = getBlue(p);
				183
				184	if (colorspace == 0)
				185	return 0.212656red+0.715158green+0.072186*blue;
				186	else
				187	{
				188	// need encode gamma
				189	}
				190	return 0.0;
				191	}
				192	)
				193
				194	STRINGIFY(
				195	__kernel
cristy	a22457d	2013-12-07 14:03:06 +0000	[diff] [blame]	196	void ConvolveOptimized(const __global CLPixelType input, __global CLPixelType output,
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	197	const unsigned int imageWidth, const unsigned int imageHeight,
				198	__constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
				199	const uint matte, const ChannelType channel, __local CLPixelType pixelLocalCache, __local float filterCache) {
				200
				201	int2 blockID;
				202	blockID.x = get_group_id(0);
				203	blockID.y = get_group_id(1);
				204
				205	// image area processed by this workgroup
				206	int2 imageAreaOrg;
				207	imageAreaOrg.x = blockID.x * get_local_size(0);
				208	imageAreaOrg.y = blockID.y * get_local_size(1);
				209
				210	int2 midFilterDimen;
				211	midFilterDimen.x = (filterWidth-1)/2;
				212	midFilterDimen.y = (filterHeight-1)/2;
				213
				214	int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
				215
				216	// dimension of the local cache
				217	int2 cachedAreaDimen;
				218	cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
				219	cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
				220
				221	// cache the pixels accessed by this workgroup in local memory
				222	int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
				223	int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
				224	int groupSize = get_local_size(0) * get_local_size(1);
				225	for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
				226
				227	int2 cachedAreaIndex;
				228	cachedAreaIndex.x = i % cachedAreaDimen.x;
				229	cachedAreaIndex.y = i / cachedAreaDimen.x;
				230
				231	int2 imagePixelIndex;
				232	imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
				233
				234	// only support EdgeVirtualPixelMethod through ClampToCanvas
				235	// TODO: implement other virtual pixel method
				236	imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
				237	imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
				238
				239	pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
				240	}
				241
				242	// cache the filter
				243	for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
				244	filterCache[i] = filter[i];
				245	}
				246	barrier(CLK_LOCAL_MEM_FENCE);
				247
				248
				249	int2 imageIndex;
				250	imageIndex.x = imageAreaOrg.x + get_local_id(0);
				251	imageIndex.y = imageAreaOrg.y + get_local_id(1);
				252
				253	// if out-of-range, stops here and quit
				254	if (imageIndex.x >= imageWidth
				255	\|\| imageIndex.y >= imageHeight) {
				256	return;
				257	}
				258
				259	int filterIndex = 0;
				260	float4 sum = (float4)0.0f;
				261	float gamma = 0.0f;
				262	if (((channel & OpacityChannel) == 0) \|\| (matte == 0)) {
				263	int cacheIndexY = get_local_id(1);
				264	for (int j = 0; j < filterHeight; j++) {
				265	int cacheIndexX = get_local_id(0);
				266	for (int i = 0; i < filterWidth; i++) {
				267	CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
				268	float f = filterCache[filterIndex];
				269
				270	sum.x += f * p.x;
				271	sum.y += f * p.y;
				272	sum.z += f * p.z;
				273	sum.w += f * p.w;
				274
				275	gamma += f;
				276	filterIndex++;
				277	cacheIndexX++;
				278	}
				279	cacheIndexY++;
				280	}
				281	}
				282	else {
				283	int cacheIndexY = get_local_id(1);
				284	for (int j = 0; j < filterHeight; j++) {
				285	int cacheIndexX = get_local_id(0);
				286	for (int i = 0; i < filterWidth; i++) {
				287
				288	CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
				289	float alpha = QuantumScale*(QuantumRange-p.w);
				290	float f = filterCache[filterIndex];
				291	float g = alpha * f;
				292
				293	sum.x += g*p.x;
				294	sum.y += g*p.y;
				295	sum.z += g*p.z;
				296	sum.w += f*p.w;
				297
				298	gamma += g;
				299	filterIndex++;
				300	cacheIndexX++;
				301	}
				302	cacheIndexY++;
				303	}
				304	gamma = PerceptibleReciprocal(gamma);
				305	sum.xyz = gamma*sum.xyz;
				306	}
				307	CLPixelType outputPixel;
				308	outputPixel.x = ClampToQuantum(sum.x);
				309	outputPixel.y = ClampToQuantum(sum.y);
				310	outputPixel.z = ClampToQuantum(sum.z);
				311	outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
				312
				313	output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
				314	}
				315	)
				316
				317	STRINGIFY(
cristy	a22457d	2013-12-07 14:03:06 +0000	[diff] [blame]	318	__kernel
				319	void Convolve(const __global CLPixelType input, __global CLPixelType output,
				320	__constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
				321	const uint matte, const ChannelType channel) {
				322
				323	int2 imageIndex;
				324	imageIndex.x = get_global_id(0);
				325	imageIndex.y = get_global_id(1);
				326
				327	unsigned int imageWidth = get_global_size(0);
				328	unsigned int imageHeight = get_global_size(1);
				329
				330	if (imageIndex.x >= imageWidth
				331	\|\| imageIndex.y >= imageHeight)
				332	return;
				333
				334	int2 midFilterDimen;
				335	midFilterDimen.x = (filterWidth-1)/2;
				336	midFilterDimen.y = (filterHeight-1)/2;
				337
				338	int filterIndex = 0;
				339	float4 sum = (float4)0.0f;
				340	float gamma = 0.0f;
				341	if (((channel & OpacityChannel) == 0) \|\| (matte == 0)) {
				342	for (int j = 0; j < filterHeight; j++) {
				343	int2 inputPixelIndex;
				344	inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
				345	inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
				346	for (int i = 0; i < filterWidth; i++) {
				347	inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
				348	inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
				349
				350	CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
				351	float f = filter[filterIndex];
				352
				353	sum.x += f * p.x;
				354	sum.y += f * p.y;
				355	sum.z += f * p.z;
				356	sum.w += f * p.w;
				357
				358	gamma += f;
				359
				360	filterIndex++;
				361	}
				362	}
				363	}
				364	else {
				365
				366	for (int j = 0; j < filterHeight; j++) {
				367	int2 inputPixelIndex;
				368	inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
				369	inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
				370	for (int i = 0; i < filterWidth; i++) {
				371	inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
				372	inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
				373
				374	CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
				375	float alpha = QuantumScale*(QuantumRange-p.w);
				376	float f = filter[filterIndex];
				377	float g = alpha * f;
				378
				379	sum.x += g*p.x;
				380	sum.y += g*p.y;
				381	sum.z += g*p.z;
				382	sum.w += f*p.w;
				383
				384	gamma += g;
				385
				386
				387	filterIndex++;
				388	}
				389	}
				390	gamma = PerceptibleReciprocal(gamma);
				391	sum.xyz = gamma*sum.xyz;
				392	}
				393
				394	CLPixelType outputPixel;
				395	outputPixel.x = ClampToQuantum(sum.x);
				396	outputPixel.y = ClampToQuantum(sum.y);
				397	outputPixel.z = ClampToQuantum(sum.z);
				398	outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
				399
				400	output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
				401	}
				402	)
				403
				404	STRINGIFY(
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	405	typedef enum
				406	{
				407	UndefinedFunction,
				408	PolynomialFunction,
				409	SinusoidFunction,
				410	ArcsinFunction,
				411	ArctanFunction
				412	} MagickFunction;
				413	)
				414
				415	STRINGIFY(
				416
				417	/*
				418	apply FunctionImageChannel(braightness-contrast)
				419	*/
				420	CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
				421	const unsigned int number_parameters,
				422	__constant float *parameters)
				423	{
				424	float4 result = (float4) 0.0f;
				425	switch (function)
				426	{
				427	case PolynomialFunction:
				428	{
				429	for (unsigned int i=0; i < number_parameters; i++)
				430	result = resultQuantumScaleconvert_float4(pixel) + parameters[i];
				431	result *= QuantumRange;
				432	break;
				433	}
				434	case SinusoidFunction:
				435	{
				436	float freq,phase,ampl,bias;
				437	freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
				438	phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
				439	ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
				440	bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
				441	result = QuantumRange(amplsin(2.0fMagickPI
				442	(freqQuantumScaleconvert_float4(pixel) + phase/360.0f)) + bias);
				443	break;
				444	}
				445	case ArcsinFunction:
				446	{
				447	float width,range,center,bias;
				448	width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
				449	center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
				450	range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
				451	bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
				452	result = 2.0f/width(QuantumScaleconvert_float4(pixel) - center);
				453	result = range/MagickPI*asin(result)+bias;
				454	result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
				455	result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
				456	result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
				457	result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
				458	result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
				459	result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
				460	result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
				461	result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
				462
				463	result *= QuantumRange;
				464	break;
				465	}
				466	case ArctanFunction:
				467	{
				468	float slope,range,center,bias;
				469	slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
				470	center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
				471	range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
				472	bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
				473	result = MagickPIslope(QuantumScale*convert_float4(pixel)-center);
				474	result = QuantumRange(range/MagickPIatan(result) + bias);
				475	break;
				476	}
				477	case UndefinedFunction:
				478	break;
				479	}
				480	return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
				481	ClampToQuantum(result.z), ClampToQuantum(result.w));
				482	}
				483	)
				484
				485	STRINGIFY(
				486	/*
				487	Improve brightness / contrast of the image
				488	channel : define which channel is improved
				489	function : the function called to enchance the brightness contrast
				490	number_parameters : numbers of parameters
				491	parameters : the parameter
				492	*/
				493	__kernel void FunctionImage(__global CLPixelType *im,
				494	const ChannelType channel, const MagickFunction function,
				495	const unsigned int number_parameters, __constant float *parameters)
				496	{
				497	const int x = get_global_id(0);
				498	const int y = get_global_id(1);
				499	const int columns = get_global_size(0);
				500	const int c = x + y * columns;
				501	im[c] = ApplyFunction(im[c], function, number_parameters, parameters);
				502	}
				503	)
				504
				505	STRINGIFY(
				506	/*
				507	*/
				508	__kernel void Equalize(__global CLPixelType * restrict im,
				509	const ChannelType channel,
				510	__global CLPixelType * restrict equalize_map,
				511	const float4 white, const float4 black)
				512	{
				513	const int x = get_global_id(0);
				514	const int y = get_global_id(1);
				515	const int columns = get_global_size(0);
				516	const int c = x + y * columns;
				517
				518	uint ePos;
				519	CLPixelType oValue, eValue;
				520	CLQuantum red, green, blue, opacity;
				521
				522	//read from global
				523	oValue=im[c];
				524
				525	if ((channel & SyncChannels) != 0)
				526	{
				527	if (getRedF4(white) != getRedF4(black))
				528	{
				529	ePos = ScaleQuantumToMap(getRed(oValue));
				530	eValue = equalize_map[ePos];
				531	red = getRed(eValue);
				532	ePos = ScaleQuantumToMap(getGreen(oValue));
				533	eValue = equalize_map[ePos];
				534	green = getRed(eValue);
				535	ePos = ScaleQuantumToMap(getBlue(oValue));
				536	eValue = equalize_map[ePos];
				537	blue = getRed(eValue);
				538	ePos = ScaleQuantumToMap(getOpacity(oValue));
				539	eValue = equalize_map[ePos];
				540	opacity = getRed(eValue);
				541
				542	//write back
				543	im[c]=(CLPixelType)(blue, green, red, opacity);
				544	}
				545
				546	}
				547
				548	// for equalizing, we always need all channels?
				549	// otherwise something more
				550
				551	}
				552	)
				553
				554	STRINGIFY(
				555	/*
				556	*/
				557	__kernel void Histogram(__global CLPixelType * restrict im,
				558	const ChannelType channel, const int colorspace,
				559	__global uint4 * restrict histogram)
				560	{
				561	const int x = get_global_id(0);
				562	const int y = get_global_id(1);
				563	const int columns = get_global_size(0);
				564	const int c = x + y * columns;
				565	if ((channel & SyncChannels) != 0)
				566	{
				567	float intensity = GetPixelIntensity(colorspace,im[c]);
				568	uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
				569	atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
				570	}
				571	else
				572	{
				573	// for equalizing, we always need all channels?
				574	// otherwise something more
				575	}
				576	}
				577	)
				578
				579	STRINGIFY(
				580	/*
				581	Reduce image noise and reduce detail levels by row
				582	im: input pixels filtered_in filtered_im: output pixels
				583	filter : convolve kernel width: convolve kernel size
				584	channel : define which channel is blured
				585	is_RGBA_BGRA : define the input is RGBA or BGRA
				586	*/
				587	__kernel void BlurRow(__global CLPixelType im, __global float4 filtered_im,
				588	const ChannelType channel, __constant float *filter,
				589	const unsigned int width,
				590	const unsigned int imageColumns, const unsigned int imageRows,
				591	__local CLPixelType *temp)
				592	{
				593	const int x = get_global_id(0);
				594	const int y = get_global_id(1);
				595
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	596	const int columns = imageColumns;
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	597
				598	const unsigned int radius = (width-1)/2;
				599	const int wsize = get_local_size(0);
				600	const unsigned int loadSize = wsize+width;
				601
				602	//load chunk only for now
				603	//event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
				604	//wait_group_events(1,&e);
				605
				606	//parallel load and clamp
				607	/*
				608	int count = 0;
				609	for (int i=0; i < loadSize; i=i+wsize)
				610	{
				611	int currentX = x + wsize*(count++);
				612
				613	int localId = get_local_id(0);
				614
				615	if ((localId+i) > loadSize)
				616	break;
				617
				618	temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
				619
				620	if (y==0 && get_group_id(0) == 0)
				621	{
				622	printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
				623	}
				624	}
				625	*/
				626
				627	//group coordinate
				628	const int groupX=get_local_size(0)*get_group_id(0);
				629	const int groupY=get_local_size(1)*get_group_id(1);
				630
				631	//parallel load and clamp
				632	for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
				633	{
				634	//int cx = ClampToCanvas(groupX+i, columns);
				635	temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
				636
				637	if (0 && y==0 && get_group_id(1) == 0)
				638	{
				639	printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
				640	}
				641	}
				642
				643	// barrier
				644	barrier(CLK_LOCAL_MEM_FENCE);
				645
				646	// only do the work if this is not a patched item
				647	if (get_global_id(0) < columns)
				648	{
				649	// compute
				650	float4 result = (float4) 0;
				651
				652	int i = 0;
				653
				654	\n #ifndef UFACTOR \n
				655	\n #define UFACTOR 8 \n
				656	\n #endif \n
				657
				658	for ( ; i+UFACTOR < width; )
				659	{
				660	\n #pragma unroll UFACTOR\n
				661	for (int j=0; j < UFACTOR; j++, i++)
				662	{
				663	result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
				664	}
				665	}
				666
				667	for ( ; i < width; i++)
				668	{
				669	result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
				670	}
				671
				672	result.x = ClampToQuantum(result.x);
				673	result.y = ClampToQuantum(result.y);
				674	result.z = ClampToQuantum(result.z);
				675	result.w = ClampToQuantum(result.w);
				676
				677	// write back to global
				678	filtered_im[y*columns+x] = result;
				679	}
				680	}
				681	)
				682
				683	STRINGIFY(
				684	/*
				685	Reduce image noise and reduce detail levels by row
				686	im: input pixels filtered_in filtered_im: output pixels
				687	filter : convolve kernel width: convolve kernel size
				688	channel : define which channel is blured
				689	is_RGBA_BGRA : define the input is RGBA or BGRA
				690	*/
				691	__kernel void BlurRowSection(__global CLPixelType im, __global float4 filtered_im,
				692	const ChannelType channel, __constant float *filter,
				693	const unsigned int width,
				694	const unsigned int imageColumns, const unsigned int imageRows,
				695	__local CLPixelType *temp,
				696	const unsigned int offsetRows, const unsigned int section)
				697	{
				698	const int x = get_global_id(0);
				699	const int y = get_global_id(1);
				700
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	701	const int columns = imageColumns;
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	702
				703	const unsigned int radius = (width-1)/2;
				704	const int wsize = get_local_size(0);
				705	const unsigned int loadSize = wsize+width;
				706
				707	//group coordinate
				708	const int groupX=get_local_size(0)*get_group_id(0);
				709	const int groupY=get_local_size(1)*get_group_id(1);
				710
				711	//offset the input data, assuming section is 0, 1
				712	im += imageColumns * (offsetRows - radius * section);
				713
				714	//parallel load and clamp
				715	for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
				716	{
				717	//int cx = ClampToCanvas(groupX+i, columns);
				718	temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
				719
				720	if (0 && y==0 && get_group_id(1) == 0)
				721	{
				722	printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
				723	}
				724	}
				725
				726	// barrier
				727	barrier(CLK_LOCAL_MEM_FENCE);
				728
				729	// only do the work if this is not a patched item
				730	if (get_global_id(0) < columns)
				731	{
				732	// compute
				733	float4 result = (float4) 0;
				734
				735	int i = 0;
				736
				737	\n #ifndef UFACTOR \n
				738	\n #define UFACTOR 8 \n
				739	\n #endif \n
				740
				741	for ( ; i+UFACTOR < width; )
				742	{
				743	\n #pragma unroll UFACTOR\n
				744	for (int j=0; j < UFACTOR; j++, i++)
				745	{
				746	result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
				747	}
				748	}
				749
				750	for ( ; i < width; i++)
				751	{
				752	result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
				753	}
				754
				755	result.x = ClampToQuantum(result.x);
				756	result.y = ClampToQuantum(result.y);
				757	result.z = ClampToQuantum(result.z);
				758	result.w = ClampToQuantum(result.w);
				759
				760	// write back to global
				761	filtered_im[y*columns+x] = result;
				762	}
				763
				764	}
				765	)
				766
				767	STRINGIFY(
				768	/*
				769	Reduce image noise and reduce detail levels by line
				770	im: input pixels filtered_in filtered_im: output pixels
				771	filter : convolve kernel width: convolve kernel size
				772	channel : define which channel is blured\
				773	is_RGBA_BGRA : define the input is RGBA or BGRA
				774	*/
				775	__kernel void BlurColumn(const __global float4 blurRowData, __global CLPixelType filtered_im,
				776	const ChannelType channel, __constant float *filter,
				777	const unsigned int width,
				778	const unsigned int imageColumns, const unsigned int imageRows,
				779	__local float4 *temp)
				780	{
				781	const int x = get_global_id(0);
				782	const int y = get_global_id(1);
				783
				784	//const int columns = get_global_size(0);
				785	//const int rows = get_global_size(1);
				786	const int columns = imageColumns;
				787	const int rows = imageRows;
				788
				789	unsigned int radius = (width-1)/2;
				790	const int wsize = get_local_size(1);
				791	const unsigned int loadSize = wsize+width;
				792
				793	//group coordinate
				794	const int groupX=get_local_size(0)*get_group_id(0);
				795	const int groupY=get_local_size(1)*get_group_id(1);
				796	//notice that get_local_size(0) is 1, so
				797	//groupX=get_group_id(0);
				798
				799	//parallel load and clamp
				800	for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
				801	{
				802	temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
				803	}
				804
				805	// barrier
				806	barrier(CLK_LOCAL_MEM_FENCE);
				807
				808	// only do the work if this is not a patched item
				809	if (get_global_id(1) < rows)
				810	{
				811	// compute
				812	float4 result = (float4) 0;
				813
				814	int i = 0;
				815
				816	\n #ifndef UFACTOR \n
				817	\n #define UFACTOR 8 \n
				818	\n #endif \n
				819
				820	for ( ; i+UFACTOR < width; )
				821	{
				822	\n #pragma unroll UFACTOR \n
				823	for (int j=0; j < UFACTOR; j++, i++)
				824	{
				825	result+=filter[i]*temp[i+get_local_id(1)];
				826	}
				827	}
				828
				829	for ( ; i < width; i++)
				830	{
				831	result+=filter[i]*temp[i+get_local_id(1)];
				832	}
				833
				834	result.x = ClampToQuantum(result.x);
				835	result.y = ClampToQuantum(result.y);
				836	result.z = ClampToQuantum(result.z);
				837	result.w = ClampToQuantum(result.w);
				838
				839	// write back to global
				840	filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
				841	}
				842
				843	}
				844	)
				845
				846
				847	STRINGIFY(
				848	/*
				849	Reduce image noise and reduce detail levels by line
				850	im: input pixels filtered_in filtered_im: output pixels
				851	filter : convolve kernel width: convolve kernel size
				852	channel : define which channel is blured\
				853	is_RGBA_BGRA : define the input is RGBA or BGRA
				854	*/
				855	__kernel void BlurColumnSection(const __global float4 blurRowData, __global CLPixelType filtered_im,
				856	const ChannelType channel, __constant float *filter,
				857	const unsigned int width,
				858	const unsigned int imageColumns, const unsigned int imageRows,
				859	__local float4 *temp,
				860	const unsigned int offsetRows, const unsigned int section)
				861	{
				862	const int x = get_global_id(0);
				863	const int y = get_global_id(1);
				864
				865	//const int columns = get_global_size(0);
				866	//const int rows = get_global_size(1);
				867	const int columns = imageColumns;
				868	const int rows = imageRows;
				869
				870	unsigned int radius = (width-1)/2;
				871	const int wsize = get_local_size(1);
				872	const unsigned int loadSize = wsize+width;
				873
				874	//group coordinate
				875	const int groupX=get_local_size(0)*get_group_id(0);
				876	const int groupY=get_local_size(1)*get_group_id(1);
				877	//notice that get_local_size(0) is 1, so
				878	//groupX=get_group_id(0);
				879
				880	// offset the input data
				881	blurRowData += imageColumns * radius * section;
				882
				883	//parallel load and clamp
				884	for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
				885	{
				886	int pos = ClampToCanvasWithHalo(i+groupY-radius, rows, radius, section) * columns + groupX;
				887	temp[i] = *(blurRowData+pos);
				888	}
				889
				890	// barrier
				891	barrier(CLK_LOCAL_MEM_FENCE);
				892
				893	// only do the work if this is not a patched item
				894	if (get_global_id(1) < rows)
				895	{
				896	// compute
				897	float4 result = (float4) 0;
				898
				899	int i = 0;
				900
				901	\n #ifndef UFACTOR \n
				902	\n #define UFACTOR 8 \n
				903	\n #endif \n
				904
				905	for ( ; i+UFACTOR < width; )
				906	{
				907	\n #pragma unroll UFACTOR \n
				908	for (int j=0; j < UFACTOR; j++, i++)
				909	{
				910	result+=filter[i]*temp[i+get_local_id(1)];
				911	}
				912	}
				913	for ( ; i < width; i++)
				914	{
				915	result+=filter[i]*temp[i+get_local_id(1)];
				916	}
				917
				918	result.x = ClampToQuantum(result.x);
				919	result.y = ClampToQuantum(result.y);
				920	result.z = ClampToQuantum(result.z);
				921	result.w = ClampToQuantum(result.w);
				922
				923	// offset the output data
				924	filtered_im += imageColumns * offsetRows;
				925
				926	// write back to global
				927	filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
				928	}
				929
				930	}
				931	)
				932
				933
				934	STRINGIFY(
				935	__kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,
				936	const __global float4 blurRowData, __global CLPixelType filtered_im,
				937	const unsigned int imageColumns, const unsigned int imageRows,
				938	__local float4* cachedData, __local float* cachedFilter,
				939	const ChannelType channel, const __global float *filter, const unsigned int width,
				940	const float gain, const float threshold)
				941	{
				942	const unsigned int radius = (width-1)/2;
				943
				944	// cache the pixel shared by the workgroup
				945	const int groupX = get_group_id(0);
				946	const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
				947	const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
				948
				949	if (groupStartY >= 0
				950	&& groupStopY < imageRows) {
				951	event_t e = async_work_group_strided_copy(cachedData
				952	,blurRowData+groupStartY*imageColumns+groupX
				953	,groupStopY-groupStartY,imageColumns,0);
				954	wait_group_events(1,&e);
				955	}
				956	else {
				957	for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
				958	cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
				959	}
				960	barrier(CLK_LOCAL_MEM_FENCE);
				961	}
				962	// cache the filter as well
				963	event_t e = async_work_group_copy(cachedFilter,filter,width,0);
				964	wait_group_events(1,&e);
				965
				966	// only do the work if this is not a patched item
				967	//const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
				968	const int cy = get_global_id(1);
				969
				970	if (cy < imageRows) {
				971	float4 blurredPixel = (float4) 0.0f;
				972
				973	int i = 0;
				974
				975	\n #ifndef UFACTOR \n
				976	\n #define UFACTOR 8 \n
				977	\n #endif \n
				978
				979	for ( ; i+UFACTOR < width; )
				980	{
				981	\n #pragma unroll UFACTOR \n
				982	for (int j=0; j < UFACTOR; j++, i++)
				983	{
				984	blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
				985	}
				986	}
				987
				988	for ( ; i < width; i++)
				989	{
				990	blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
				991	}
				992
				993	blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
				994	,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
				995
				996	float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
				997	float4 outputPixel = inputImagePixel - blurredPixel;
				998
				999	float quantumThreshold = QuantumRange*threshold;
				1000
				1001	int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
				1002	outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
				1003
				1004	//write back
				1005	filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
				1006	,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
				1007
				1008	}
				1009	}
				1010
				1011	__kernel void UnsharpMaskBlurColumnSection(const __global CLPixelType* inputImage,
				1012	const __global float4 blurRowData, __global CLPixelType filtered_im,
				1013	const unsigned int imageColumns, const unsigned int imageRows,
				1014	__local float4* cachedData, __local float* cachedFilter,
				1015	const ChannelType channel, const __global float *filter, const unsigned int width,
				1016	const float gain, const float threshold,
				1017	const unsigned int offsetRows, const unsigned int section)
				1018	{
				1019	const unsigned int radius = (width-1)/2;
				1020
				1021	// cache the pixel shared by the workgroup
				1022	const int groupX = get_group_id(0);
				1023	const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
				1024	const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
				1025
				1026	// offset the input data
				1027	blurRowData += imageColumns * radius * section;
				1028
				1029	if (groupStartY >= 0
				1030	&& groupStopY < imageRows) {
				1031	event_t e = async_work_group_strided_copy(cachedData
				1032	,blurRowData+groupStartY*imageColumns+groupX
				1033	,groupStopY-groupStartY,imageColumns,0);
				1034	wait_group_events(1,&e);
				1035	}
				1036	else {
				1037	for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
				1038	int pos = ClampToCanvasWithHalo(groupStartY+i,imageRows, radius, section)*imageColumns+ groupX;
				1039	cachedData[i] = *(blurRowData + pos);
				1040	}
				1041	barrier(CLK_LOCAL_MEM_FENCE);
				1042	}
				1043	// cache the filter as well
				1044	event_t e = async_work_group_copy(cachedFilter,filter,width,0);
				1045	wait_group_events(1,&e);
				1046
				1047	// only do the work if this is not a patched item
				1048	//const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
				1049	const int cy = get_global_id(1);
				1050
				1051	if (cy < imageRows) {
				1052	float4 blurredPixel = (float4) 0.0f;
				1053
				1054	int i = 0;
				1055
				1056	\n #ifndef UFACTOR \n
				1057	\n #define UFACTOR 8 \n
				1058	\n #endif \n
				1059
				1060	for ( ; i+UFACTOR < width; )
				1061	{
				1062	\n #pragma unroll UFACTOR \n
				1063	for (int j=0; j < UFACTOR; j++, i++)
				1064	{
				1065	blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
				1066	}
				1067	}
				1068
				1069	for ( ; i < width; i++)
				1070	{
				1071	blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
				1072	}
				1073
				1074	blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
				1075	,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
				1076
				1077	// offset the output data
				1078	inputImage += imageColumns * offsetRows;
				1079	filtered_im += imageColumns * offsetRows;
				1080
				1081	float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
				1082	float4 outputPixel = inputImagePixel - blurredPixel;
				1083
				1084	float quantumThreshold = QuantumRange*threshold;
				1085
				1086	int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
				1087	outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
				1088
				1089	//write back
				1090	filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
				1091	,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
				1092
				1093	}
				1094
				1095	}
				1096	)
				1097
				1098
				1099
				1100	STRINGIFY(
				1101
				1102	__kernel void HullPass1(const __global CLPixelType inputImage, __global CLPixelType outputImage
				1103	, const unsigned int imageWidth, const unsigned int imageHeight
				1104	, const int2 offset, const int polarity, const int matte) {
				1105
				1106	int x = get_global_id(0);
				1107	int y = get_global_id(1);
				1108
				1109	CLPixelType v = inputImage[y*imageWidth+x];
				1110
				1111	int2 neighbor;
				1112	neighbor.y = y + offset.y;
				1113	neighbor.x = x + offset.x;
				1114
				1115	int2 clampedNeighbor;
				1116	clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
				1117	clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
				1118
				1119	CLPixelType r = (clampedNeighbor.x == neighbor.x
				1120	&& clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
				1121	:(CLPixelType)0;
				1122
				1123	int sv[4];
				1124	sv[0] = (int)v.x;
				1125	sv[1] = (int)v.y;
				1126	sv[2] = (int)v.z;
				1127	sv[3] = (int)v.w;
				1128
				1129	int sr[4];
				1130	sr[0] = (int)r.x;
				1131	sr[1] = (int)r.y;
				1132	sr[2] = (int)r.z;
				1133	sr[3] = (int)r.w;
				1134
				1135	if (polarity > 0) {
				1136	\n #pragma unroll 4\n
				1137	for (unsigned int i = 0; i < 4; i++) {
				1138	sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
				1139	}
				1140	}
				1141	else {
				1142	\n #pragma unroll 4\n
				1143	for (unsigned int i = 0; i < 4; i++) {
				1144	sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
				1145	}
				1146
				1147	}
				1148
				1149	v.x = (CLQuantum)sv[0];
				1150	v.y = (CLQuantum)sv[1];
				1151	v.z = (CLQuantum)sv[2];
				1152
				1153	if (matte!=0)
				1154	v.w = (CLQuantum)sv[3];
				1155
				1156	outputImage[y*imageWidth+x] = v;
				1157
				1158	}
				1159
				1160
				1161	)
				1162
				1163
				1164
				1165	STRINGIFY(
				1166
				1167	__kernel void HullPass2(const __global CLPixelType inputImage, __global CLPixelType outputImage
				1168	, const unsigned int imageWidth, const unsigned int imageHeight
				1169	, const int2 offset, const int polarity, const int matte) {
				1170
				1171	int x = get_global_id(0);
				1172	int y = get_global_id(1);
				1173
				1174	CLPixelType v = inputImage[y*imageWidth+x];
				1175
				1176	int2 neighbor, clampedNeighbor;
				1177
				1178	neighbor.y = y + offset.y;
				1179	neighbor.x = x + offset.x;
				1180	clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
				1181	clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
				1182
				1183	CLPixelType r = (clampedNeighbor.x == neighbor.x
				1184	&& clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
				1185	:(CLPixelType)0;
				1186
				1187
				1188	neighbor.y = y - offset.y;
				1189	neighbor.x = x - offset.x;
				1190	clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
				1191	clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
				1192
				1193	CLPixelType s = (clampedNeighbor.x == neighbor.x
				1194	&& clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
				1195	:(CLPixelType)0;
				1196
				1197
				1198	int sv[4];
				1199	sv[0] = (int)v.x;
				1200	sv[1] = (int)v.y;
				1201	sv[2] = (int)v.z;
				1202	sv[3] = (int)v.w;
				1203
				1204	int sr[4];
				1205	sr[0] = (int)r.x;
				1206	sr[1] = (int)r.y;
				1207	sr[2] = (int)r.z;
				1208	sr[3] = (int)r.w;
				1209
				1210	int ss[4];
				1211	ss[0] = (int)s.x;
				1212	ss[1] = (int)s.y;
				1213	ss[2] = (int)s.z;
				1214	ss[3] = (int)s.w;
				1215
				1216	if (polarity > 0) {
				1217	\n #pragma unroll 4\n
				1218	for (unsigned int i = 0; i < 4; i++) {
				1219	//sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
				1220	//
				1221	//sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
				1222	//sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) \|\| (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
				1223	sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
				1224	}
				1225	}
				1226	else {
				1227	\n #pragma unroll 4\n
				1228	for (unsigned int i = 0; i < 4; i++) {
				1229	//sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
				1230	//
				1231	//sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
				1232	sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
				1233	}
				1234	}
				1235
				1236	v.x = (CLQuantum)sv[0];
				1237	v.y = (CLQuantum)sv[1];
				1238	v.z = (CLQuantum)sv[2];
				1239
				1240	if (matte!=0)
				1241	v.w = (CLQuantum)sv[3];
				1242
				1243	outputImage[y*imageWidth+x] = v;
				1244
				1245	}
				1246
				1247
				1248	)
				1249
				1250
				1251	STRINGIFY(
				1252	__kernel void RadialBlur(const __global CLPixelType im, __global CLPixelType filtered_im,
				1253	const float4 bias,
				1254	const unsigned int channel, const unsigned int matte,
				1255	const float2 blurCenter,
				1256	__constant float cos_theta, __constant float sin_theta,
				1257	const unsigned int cossin_theta_size)
				1258	{
				1259	const int x = get_global_id(0);
				1260	const int y = get_global_id(1);
				1261	const int columns = get_global_size(0);
				1262	const int rows = get_global_size(1);
				1263	unsigned int step = 1;
				1264	float center_x = (float) x - blurCenter.x;
				1265	float center_y = (float) y - blurCenter.y;
				1266	float radius = hypot(center_x, center_y);
				1267
				1268	//float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
				1269	float blur_radius = hypot(blurCenter.x, blurCenter.y);
				1270
				1271	if (radius > MagickEpsilon)
				1272	{
				1273	step = (unsigned int) (blur_radius / radius);
				1274	if (step == 0)
				1275	step = 1;
				1276	if (step >= cossin_theta_size)
				1277	step = cossin_theta_size-1;
				1278	}
				1279
				1280	float4 result;
				1281	result.x = (float)bias.x;
				1282	result.y = (float)bias.y;
				1283	result.z = (float)bias.z;
				1284	result.w = (float)bias.w;
				1285	float normalize = 0.0f;
				1286
				1287	if (((channel & OpacityChannel) == 0) \|\| (matte == 0)) {
				1288	for (unsigned int i=0; i<cossin_theta_size; i+=step)
				1289	{
				1290	result += convert_float4(im[
				1291	ClampToCanvas(blurCenter.x+center_xcos_theta[i]-center_ysin_theta[i]+0.5f,columns)+
				1292	ClampToCanvas(blurCenter.y+center_xsin_theta[i]+center_ycos_theta[i]+0.5f, rows)*columns]);
				1293	normalize += 1.0f;
				1294	}
				1295	normalize = PerceptibleReciprocal(normalize);
				1296	result = result * normalize;
				1297	}
				1298	else {
				1299	float gamma = 0.0f;
				1300	for (unsigned int i=0; i<cossin_theta_size; i+=step)
				1301	{
				1302	float4 p = convert_float4(im[
				1303	ClampToCanvas(blurCenter.x+center_xcos_theta[i]-center_ysin_theta[i]+0.5f,columns)+
				1304	ClampToCanvas(blurCenter.y+center_xsin_theta[i]+center_ycos_theta[i]+0.5f, rows)*columns]);
				1305
				1306	float alpha = (float)(QuantumScale*(QuantumRange-p.w));
				1307	result.x += alpha * p.x;
				1308	result.y += alpha * p.y;
				1309	result.z += alpha * p.z;
				1310	result.w += p.w;
				1311	gamma+=alpha;
				1312	normalize += 1.0f;
				1313	}
				1314	gamma = PerceptibleReciprocal(gamma);
				1315	normalize = PerceptibleReciprocal(normalize);
				1316	result.x = gamma*result.x;
				1317	result.y = gamma*result.y;
				1318	result.z = gamma*result.z;
				1319	result.w = normalize*result.w;
				1320	}
				1321	filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
				1322	ClampToQuantum(result.z), ClampToQuantum(result.w));
				1323	}
				1324	)
				1325
				1326	STRINGIFY(
				1327	typedef enum
				1328	{
				1329	UndefinedColorspace,
				1330	RGBColorspace, /* Linear RGB colorspace */
				1331	GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */
				1332	TransparentColorspace,
				1333	OHTAColorspace,
				1334	LabColorspace,
				1335	XYZColorspace,
				1336	YCbCrColorspace,
				1337	YCCColorspace,
				1338	YIQColorspace,
				1339	YPbPrColorspace,
				1340	YUVColorspace,
				1341	CMYKColorspace, /* negared linear RGB with black separated */
				1342	sRGBColorspace, /* Default: non-lienar sRGB colorspace */
				1343	HSBColorspace,
				1344	HSLColorspace,
				1345	HWBColorspace,
				1346	Rec601LumaColorspace,
				1347	Rec601YCbCrColorspace,
				1348	Rec709LumaColorspace,
				1349	Rec709YCbCrColorspace,
				1350	LogColorspace,
				1351	CMYColorspace, /* negated linear RGB colorspace */
				1352	LuvColorspace,
				1353	HCLColorspace,
				1354	LCHColorspace, /* alias for LCHuv */
				1355	LMSColorspace,
				1356	LCHabColorspace, /* Cylindrical (Polar) Lab */
				1357	LCHuvColorspace, /* Cylindrical (Polar) Luv */
				1358	scRGBColorspace,
				1359	HSIColorspace,
				1360	HSVColorspace, /* alias for HSB */
				1361	HCLpColorspace,
				1362	YDbDrColorspace
				1363	} ColorspaceType;
				1364	)
				1365
				1366
				1367	STRINGIFY(
				1368
				1369	inline float3 ConvertRGBToHSB(CLPixelType pixel) {
				1370	float3 HueSaturationBrightness;
				1371	HueSaturationBrightness.x = 0.0f; // Hue
				1372	HueSaturationBrightness.y = 0.0f; // Saturation
				1373	HueSaturationBrightness.z = 0.0f; // Brightness
				1374
				1375	float r=(float) getRed(pixel);
				1376	float g=(float) getGreen(pixel);
				1377	float b=(float) getBlue(pixel);
				1378
				1379	float tmin=min(min(r,g),b);
				1380	float tmax=max(max(r,g),b);
				1381
				1382	if (tmax!=0.0f) {
				1383	float delta=tmax-tmin;
				1384	HueSaturationBrightness.y=delta/tmax;
				1385	HueSaturationBrightness.z=QuantumScale*tmax;
				1386
				1387	if (delta != 0.0f) {
				1388	HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
				1389	HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
				1390	HueSaturationBrightness.x/=6.0f;
				1391	HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
				1392	}
				1393	}
				1394	return HueSaturationBrightness;
				1395	}
				1396
				1397	inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
				1398
				1399	float hue = HueSaturationBrightness.x;
				1400	float brightness = HueSaturationBrightness.z;
				1401	float saturation = HueSaturationBrightness.y;
				1402
				1403	CLPixelType rgb;
				1404
				1405	if (saturation == 0.0f) {
				1406	setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
				1407	setGreen(&rgb,getRed(rgb));
				1408	setBlue(&rgb,getRed(rgb));
				1409	}
				1410	else {
				1411
				1412	float h=6.0f*(hue-floor(hue));
				1413	float f=h-floor(h);
				1414	float p=brightness*(1.0f-saturation);
				1415	float q=brightness(1.0f-saturationf);
				1416	float t=brightness(1.0f-(saturation(1.0f-f)));
				1417
				1418	float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
				1419	float clamped_t = ClampToQuantum(QuantumRange*t);
				1420	float clamped_p = ClampToQuantum(QuantumRange*p);
				1421	float clamped_q = ClampToQuantum(QuantumRange*q);
				1422	int ih = (int)h;
				1423	setRed(&rgb, (ih == 1)?clamped_q:
				1424	(ih == 2 \|\| ih == 3)?clamped_p:
				1425	(ih == 4)?clamped_t:
				1426	clampedBrightness);
				1427
				1428	setGreen(&rgb, (ih == 1 \|\| ih == 2)?clampedBrightness:
				1429	(ih == 3)?clamped_q:
				1430	(ih == 4 \|\| ih == 5)?clamped_p:
				1431	clamped_t);
				1432
				1433	setBlue(&rgb, (ih == 2)?clamped_t:
				1434	(ih == 3 \|\| ih == 4)?clampedBrightness:
				1435	(ih == 5)?clamped_q:
				1436	clamped_p);
				1437	}
				1438	return rgb;
				1439	}
				1440
				1441	__kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
				1442	{
				1443
				1444	const int sign = sharpen!=0?1:-1;
				1445	const int x = get_global_id(0);
				1446	const int y = get_global_id(1);
				1447	const int columns = get_global_size(0);
				1448	const int c = x + y * columns;
				1449
				1450	CLPixelType pixel = im[c];
				1451	float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
				1452	float brightness = HueSaturationBrightness.z;
				1453	brightness+=0.5fsign(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
				1454	brightness = clamp(brightness,0.0f,1.0f);
				1455	HueSaturationBrightness.z = brightness;
				1456
				1457	CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
				1458	filteredPixel.w = pixel.w;
				1459	im[c] = filteredPixel;
				1460	}
				1461
				1462
				1463	)
				1464
				1465	STRINGIFY(
				1466
				1467	inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
				1468	float hue, float saturation, float *lightness)
				1469	{
				1470	float
				1471	c,
				1472	tmax,
				1473	tmin;
				1474
				1475	/*
				1476	Convert RGB to HSL colorspace.
				1477	*/
				1478	tmax=max(QuantumScalered,max(QuantumScalegreen, QuantumScale*blue));
				1479	tmin=min(QuantumScalered,min(QuantumScalegreen, QuantumScale*blue));
				1480
				1481	c=tmax-tmin;
				1482
				1483	*lightness=(tmax+tmin)/2.0;
				1484	if (c <= 0.0)
				1485	{
				1486	*hue=0.0;
				1487	*saturation=0.0;
				1488	return;
				1489	}
				1490
				1491	if (tmax == (QuantumScale*red))
				1492	{
				1493	hue=(QuantumScalegreen-QuantumScale*blue)/c;
				1494	if ((QuantumScalegreen) < (QuantumScaleblue))
				1495	*hue+=6.0;
				1496	}
				1497	else
				1498	if (tmax == (QuantumScale*green))
				1499	hue=2.0+(QuantumScaleblue-QuantumScale*red)/c;
				1500	else
				1501	hue=4.0+(QuantumScalered-QuantumScale*green)/c;
				1502
				1503	hue=60.0/360.0;
				1504	if (*lightness <= 0.5)
				1505	saturation=c/(2.0(*lightness));
				1506	else
				1507	saturation=c/(2.0-2.0(*lightness));
				1508	}
				1509
				1510	inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
				1511	CLQuantum red,CLQuantum green,CLQuantum *blue)
				1512	{
				1513	float
				1514	b,
				1515	c,
				1516	g,
				1517	h,
				1518	tmin,
				1519	r,
				1520	x;
				1521
				1522	/*
				1523	Convert HSL to RGB colorspace.
				1524	*/
				1525	h=hue*360.0;
				1526	if (lightness <= 0.5)
				1527	c=2.0lightnesssaturation;
				1528	else
				1529	c=(2.0-2.0lightness)saturation;
				1530	tmin=lightness-0.5*c;
				1531	h-=360.0*floor(h/360.0);
				1532	h/=60.0;
				1533	x=c(1.0-fabs(h-2.0floor(h/2.0)-1.0));
				1534	switch ((int) floor(h))
				1535	{
				1536	case 0:
				1537	{
				1538	r=tmin+c;
				1539	g=tmin+x;
				1540	b=tmin;
				1541	break;
				1542	}
				1543	case 1:
				1544	{
				1545	r=tmin+x;
				1546	g=tmin+c;
				1547	b=tmin;
				1548	break;
				1549	}
				1550	case 2:
				1551	{
				1552	r=tmin;
				1553	g=tmin+c;
				1554	b=tmin+x;
				1555	break;
				1556	}
				1557	case 3:
				1558	{
				1559	r=tmin;
				1560	g=tmin+x;
				1561	b=tmin+c;
				1562	break;
				1563	}
				1564	case 4:
				1565	{
				1566	r=tmin+x;
				1567	g=tmin;
				1568	b=tmin+c;
				1569	break;
				1570	}
				1571	case 5:
				1572	{
				1573	r=tmin+c;
				1574	g=tmin;
				1575	b=tmin+x;
				1576	break;
				1577	}
				1578	default:
				1579	{
				1580	r=0.0;
				1581	g=0.0;
				1582	b=0.0;
				1583	}
				1584	}
				1585	red=ClampToQuantum(QuantumRanger);
				1586	green=ClampToQuantum(QuantumRangeg);
				1587	blue=ClampToQuantum(QuantumRangeb);
				1588	}
				1589
				1590	inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
				1591	CLQuantum red,CLQuantum green,CLQuantum *blue)
				1592	{
				1593	float
				1594	hue,
				1595	lightness,
				1596	saturation;
				1597
				1598	/*
				1599	Increase or decrease color lightness, saturation, or hue.
				1600	*/
				1601	ConvertRGBToHSL(red,green,*blue,&hue,&saturation,&lightness);
				1602	hue+=0.5(0.01percent_hue-1.0);
				1603	while (hue < 0.0)
				1604	hue+=1.0;
				1605	while (hue >= 1.0)
				1606	hue-=1.0;
				1607	saturation=0.01percent_saturation;
				1608	lightness=0.01percent_lightness;
				1609	ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
				1610	}
				1611
				1612	__kernel void Modulate(__global CLPixelType *im,
				1613	const float percent_brightness,
				1614	const float percent_hue,
				1615	const float percent_saturation,
				1616	const int colorspace)
				1617	{
				1618
				1619	const int x = get_global_id(0);
				1620	const int y = get_global_id(1);
				1621	const int columns = get_global_size(0);
				1622	const int c = x + y * columns;
				1623
				1624	CLPixelType pixel = im[c];
				1625
				1626	CLQuantum
				1627	blue,
				1628	green,
				1629	red;
				1630
				1631	red=getRed(pixel);
				1632	green=getGreen(pixel);
				1633	blue=getBlue(pixel);
				1634
				1635	switch (colorspace)
				1636	{
				1637	case HSLColorspace:
				1638	default:
				1639	{
				1640	ModulateHSL(percent_hue, percent_saturation, percent_brightness,
				1641	&red, &green, &blue);
				1642	}
				1643
				1644	}
				1645
				1646	CLPixelType filteredPixel;
				1647
				1648	setRed(&filteredPixel, red);
				1649	setGreen(&filteredPixel, green);
				1650	setBlue(&filteredPixel, blue);
				1651	filteredPixel.w = pixel.w;
				1652
				1653	im[c] = filteredPixel;
				1654	}
				1655	)
				1656
				1657	STRINGIFY(
				1658	// Based on Box from resize.c
				1659	float BoxResizeFilter(const float x)
				1660	{
				1661	return 1.0f;
				1662	}
				1663	)
				1664
				1665	STRINGIFY(
				1666	// Based on CubicBC from resize.c
				1667	float CubicBC(const float x,const __global float* resizeFilterCoefficients)
				1668	{
				1669	/*
				1670	Cubic Filters using B,C determined values:
				1671	Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
				1672	Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
				1673	Spline B = 1 C = 0 B-Spline Gaussian approximation
				1674	Hermite B = 0 C = 0 B-Spline interpolator
				1675
				1676	See paper by Mitchell and Netravali, Reconstruction Filters in Computer
				1677	Graphics Computer Graphics, Volume 22, Number 4, August 1988
				1678	http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
				1679	Mitchell.pdf.
				1680
				1681	Coefficents are determined from B,C values:
				1682	P0 = ( 6 - 2*B )/6 = coeff[0]
				1683	P1 = 0
				1684	P2 = (-18 +12B + 6C )/6 = coeff[1]
				1685	P3 = ( 12 - 9B - 6C )/6 = coeff[2]
				1686	Q0 = ( 8B +24C )/6 = coeff[3]
				1687	Q1 = ( -12B -48C )/6 = coeff[4]
				1688	Q2 = ( 6B +30C )/6 = coeff[5]
				1689	Q3 = ( - 1B - 6C )/6 = coeff[6]
				1690
				1691	which are used to define the filter:
				1692
				1693	P0 + P1x + P2x^2 + P3*x^3 0 <= x < 1
				1694	Q0 + Q1x + Q2x^2 + Q3*x^3 1 <= x < 2
				1695
				1696	which ensures function is continuous in value and derivative (slope).
				1697	*/
				1698	if (x < 1.0)
				1699	return(resizeFilterCoefficients[0]+x(x
				1700	(resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
				1701	if (x < 2.0)
				1702	return(resizeFilterCoefficients[3]+x(resizeFilterCoefficients[4]+x
				1703	(resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
				1704	return(0.0);
				1705	}
				1706	)
				1707
				1708	STRINGIFY(
				1709	float Sinc(const float x)
				1710	{
				1711	if (x != 0.0f)
				1712	{
				1713	const float alpha=(float) (MagickPI*x);
				1714	return sinpi(x)/alpha;
				1715	}
				1716	return(1.0f);
				1717	}
				1718	)
				1719
				1720	STRINGIFY(
				1721	float Triangle(const float x)
				1722	{
				1723	/*
				1724	1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
				1725	a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
				1726	for Sinc().
				1727	*/
				1728	return ((x<1.0f)?(1.0f-x):0.0f);
				1729	}
				1730	)
				1731
				1732
				1733	STRINGIFY(
				1734	float Hanning(const float x)
				1735	{
				1736	/*
				1737	Cosine window function:
				1738	0.5+0.5cos(pix).
				1739	*/
				1740	const float cosine=cos((MagickPI*x));
				1741	return(0.5f+0.5f*cosine);
				1742	}
				1743	)
				1744
				1745	STRINGIFY(
				1746	float Hamming(const float x)
				1747	{
				1748	/*
				1749	Offset cosine window function:
				1750	.54 + .46 cos(pi x).
				1751	*/
				1752	const float cosine=cos((MagickPI*x));
				1753	return(0.54f+0.46f*cosine);
				1754	}
				1755	)
				1756
				1757	STRINGIFY(
				1758	float Blackman(const float x)
				1759	{
				1760	/*
				1761	Blackman: 2nd order cosine windowing function:
				1762	0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
				1763
				1764	Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
				1765	five flops.
				1766	*/
				1767	const float cosine=cos((MagickPI*x));
				1768	return(0.34f+cosine(0.5f+cosine0.16f));
				1769	}
				1770	)
				1771
				1772
				1773	STRINGIFY(
				1774	typedef enum {
				1775	BoxWeightingFunction = 0,
				1776	TriangleWeightingFunction,
				1777	CubicBCWeightingFunction,
				1778	HanningWeightingFunction,
				1779	HammingWeightingFunction,
				1780	BlackmanWeightingFunction,
				1781	GaussianWeightingFunction,
				1782	QuadraticWeightingFunction,
				1783	JincWeightingFunction,
				1784	SincWeightingFunction,
				1785	SincFastWeightingFunction,
				1786	KaiserWeightingFunction,
				1787	WelshWeightingFunction,
				1788	BohmanWeightingFunction,
				1789	LagrangeWeightingFunction,
				1790	CosineWeightingFunction,
				1791	} ResizeWeightingFunctionType;
				1792	)
				1793
				1794	STRINGIFY(
				1795	inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
				1796	{
				1797	switch (filterType)
				1798	{
				1799	/* Call Sinc even for SincFast to get better precision on GPU
				1800	and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
				1801	case SincWeightingFunction:
				1802	case SincFastWeightingFunction:
				1803	return Sinc(x);
				1804	case CubicBCWeightingFunction:
				1805	return CubicBC(x,filterCoefficients);
				1806	case BoxWeightingFunction:
				1807	return BoxResizeFilter(x);
				1808	case TriangleWeightingFunction:
				1809	return Triangle(x);
				1810	case HanningWeightingFunction:
				1811	return Hanning(x);
				1812	case HammingWeightingFunction:
				1813	return Hamming(x);
				1814	case BlackmanWeightingFunction:
				1815	return Blackman(x);
				1816
				1817	default:
				1818	return 0.0f;
				1819	}
				1820	}
				1821	)
				1822
				1823
				1824	STRINGIFY(
				1825	inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
				1826	, const ResizeWeightingFunctionType resizeWindowType
				1827	, const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
				1828	{
				1829	float scale;
				1830	float xBlur = fabs(x/resizeFilterBlur);
				1831	if (resizeWindowSupport < MagickEpsilon
				1832	\|\| resizeWindowType == BoxWeightingFunction)
				1833	{
				1834	scale = 1.0f;
				1835	}
				1836	else
				1837	{
				1838	scale = resizeFilterScale;
				1839	scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
				1840	}
				1841	float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
				1842	return weight;
				1843	}
				1844
				1845	)
				1846
				1847	;
				1848	const char* accelerateKernels2 =
				1849
				1850	STRINGIFY(
				1851
				1852	inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
				1853	return (numWorkItems/pixelPerWorkgroup);
				1854	}
				1855
				1856	// returns the index of the pixel for the current workitem to compute.
				1857	// returns -1 if this workitem doesn't need to participate in any computation
				1858	inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
				1859	const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
				1860	int pixelIndex = itemID/numWorkItemsPerPixel;
				1861	pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
				1862	return pixelIndex;
				1863	}
				1864
				1865	)
				1866
				1867	STRINGIFY(
cristy	e85d0f7	2013-11-27 02:25:43 +0000	[diff] [blame]	1868	__kernel __attribute__((reqd_work_group_size(256, 1, 1)))
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	1869	void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
				1870	, const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
				1871	, const int resizeFilterType, const int resizeWindowType
				1872	, const __global float* resizeFilterCubicCoefficients
				1873	, const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
				1874	, __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
				1875	, __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
				1876
				1877
				1878	// calculate the range of resized image pixels computed by this workgroup
				1879	const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
				1880	const unsigned int stopX = min(startX + pixelPerWorkgroup,filteredColumns);
				1881	const unsigned int actualNumPixelToCompute = stopX - startX;
				1882
				1883	// calculate the range of input image pixels to cache
				1884	float scale = max(1.0/xFactor+MagickEpsilon ,1.0f);
				1885	const float support = max(scale*resizeFilterSupport,0.5f);
				1886	scale = PerceptibleReciprocal(scale);
				1887
				1888	const int cacheRangeStartX = max((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
				1889	const int cacheRangeEndX = min((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
				1890
				1891	// cache the input pixels into local memory
				1892	const unsigned int y = get_global_id(1);
				1893	event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
				1894	wait_group_events(1,&e);
				1895
				1896	unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
				1897	for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
				1898	{
				1899
				1900	const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
				1901	const unsigned int chunkStopX = min(chunkStartX + pixelChunkSize, stopX);
				1902	const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
				1903
				1904	// determine which resized pixel computed by this workitem
				1905	const unsigned int itemID = get_local_id(0);
				1906	const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
				1907
				1908	const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
				1909
				1910	float4 filteredPixel = (float4)0.0f;
				1911	float density = 0.0f;
				1912	float gamma = 0.0f;
				1913	// -1 means this workitem doesn't participate in the computation
				1914	if (pixelIndex != -1) {
				1915
				1916	// x coordinated of the resized pixel computed by this workitem
				1917	const int x = chunkStartX + pixelIndex;
				1918
				1919	// calculate how many steps required for this pixel
				1920	const float bisect = (x+0.5)/xFactor+MagickEpsilon;
				1921	const unsigned int start = (unsigned int)max(bisect-support+0.5f,0.0f);
				1922	const unsigned int stop = (unsigned int)min(bisect+support+0.5f,(float)inputColumns);
				1923	const unsigned int n = stop - start;
				1924
				1925	// calculate how many steps this workitem will contribute
				1926	unsigned int numStepsPerWorkItem = n / numItems;
				1927	numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
				1928
				1929	const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
				1930	if (startStep < n) {
				1931	const unsigned int stopStep = min(startStep+numStepsPerWorkItem, n);
				1932
				1933	unsigned int cacheIndex = start+startStep-cacheRangeStartX;
				1934	if (matte == 0) {
				1935
				1936	for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
				1937	float4 cp = convert_float4(inputImageCache[cacheIndex]);
				1938
				1939	float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
				1940	, (ResizeWeightingFunctionType)resizeWindowType
				1941	, resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
				1942
				1943	filteredPixel += ((float4)weight)*cp;
				1944	density+=weight;
				1945	}
				1946
				1947
				1948	}
				1949	else {
				1950	for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
				1951	CLPixelType p = inputImageCache[cacheIndex];
				1952
				1953	float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
				1954	, (ResizeWeightingFunctionType)resizeWindowType
				1955	, resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
				1956
				1957	float alpha = weight * QuantumScale * GetPixelAlpha(p);
				1958	float4 cp = convert_float4(p);
				1959
				1960	filteredPixel.x += alpha * cp.x;
				1961	filteredPixel.y += alpha * cp.y;
				1962	filteredPixel.z += alpha * cp.z;
				1963	filteredPixel.w += weight * cp.w;
				1964
				1965	density+=weight;
				1966	gamma+=alpha;
				1967	}
				1968	}
				1969	}
				1970	}
				1971
				1972	// initialize the accumulators to zero
				1973	if (itemID < actualNumPixelInThisChunk) {
				1974	outputPixelCache[itemID] = (float4)0.0f;
				1975	densityCache[itemID] = 0.0f;
				1976	if (matte != 0)
				1977	gammaCache[itemID] = 0.0f;
				1978	}
				1979	barrier(CLK_LOCAL_MEM_FENCE);
				1980
				1981	// accumulatte the filtered pixel value and the density
				1982	for (unsigned int i = 0; i < numItems; i++) {
				1983	if (pixelIndex != -1) {
				1984	if (itemID%numItems == i) {
				1985	outputPixelCache[pixelIndex]+=filteredPixel;
				1986	densityCache[pixelIndex]+=density;
				1987	if (matte!=0) {
				1988	gammaCache[pixelIndex]+=gamma;
				1989	}
				1990	}
				1991	}
				1992	barrier(CLK_LOCAL_MEM_FENCE);
				1993	}
				1994
				1995	if (itemID < actualNumPixelInThisChunk) {
				1996	if (matte==0) {
				1997	float density = densityCache[itemID];
				1998	float4 filteredPixel = outputPixelCache[itemID];
				1999	if (density!= 0.0f && density != 1.0)
				2000	{
				2001	density = PerceptibleReciprocal(density);
				2002	filteredPixel *= (float4)density;
				2003	}
				2004	filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
				2005	, ClampToQuantum(filteredPixel.y)
				2006	, ClampToQuantum(filteredPixel.z)
				2007	, ClampToQuantum(filteredPixel.w));
				2008	}
				2009	else {
				2010	float density = densityCache[itemID];
				2011	float gamma = gammaCache[itemID];
				2012	float4 filteredPixel = outputPixelCache[itemID];
				2013
				2014	if (density!= 0.0f && density != 1.0) {
				2015	density = PerceptibleReciprocal(density);
				2016	filteredPixel *= (float4)density;
				2017	gamma *= density;
				2018	}
				2019	gamma = PerceptibleReciprocal(gamma);
				2020
				2021	CLPixelType fp;
				2022	fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
				2023	, ClampToQuantum(gamma*filteredPixel.y)
				2024	, ClampToQuantum(gamma*filteredPixel.z)
				2025	, ClampToQuantum(filteredPixel.w));
				2026
				2027	filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
				2028
				2029	}
				2030	}
				2031
				2032	} // end of chunking loop
				2033	}
				2034	)
				2035
				2036
				2037
				2038	STRINGIFY(
				2039	__kernel __attribute__((reqd_work_group_size(256, 1, 1)))
				2040	void ResizeHorizontalFilterSinc(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
				2041	, const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
				2042	, const int resizeFilterType, const int resizeWindowType
				2043	, const __global float* resizeFilterCubicCoefficients
				2044	, const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
				2045	, __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
				2046	, __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
				2047
				2048	ResizeHorizontalFilter(inputImage,inputColumns,inputRows,matte
				2049	,xFactor, filteredImage, filteredColumns, filteredRows
				2050	,SincWeightingFunction, SincWeightingFunction
				2051	,resizeFilterCubicCoefficients
				2052	,resizeFilterScale, resizeFilterSupport, resizeFilterWindowSupport, resizeFilterBlur
				2053	,inputImageCache, numCachedPixels, pixelPerWorkgroup, pixelChunkSize
				2054	,outputPixelCache, densityCache, gammaCache);
				2055
				2056	}
				2057	)
				2058
				2059
				2060	STRINGIFY(
cristy	e85d0f7	2013-11-27 02:25:43 +0000	[diff] [blame]	2061	__kernel __attribute__((reqd_work_group_size(1, 256, 1)))
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	2062	void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
				2063	, const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
				2064	, const int resizeFilterType, const int resizeWindowType
				2065	, const __global float* resizeFilterCubicCoefficients
				2066	, const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
				2067	, __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
				2068	, __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
				2069
				2070
				2071	// calculate the range of resized image pixels computed by this workgroup
				2072	const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
				2073	const unsigned int stopY = min(startY + pixelPerWorkgroup,filteredRows);
				2074	const unsigned int actualNumPixelToCompute = stopY - startY;
				2075
				2076	// calculate the range of input image pixels to cache
				2077	float scale = max(1.0/yFactor+MagickEpsilon ,1.0f);
				2078	const float support = max(scale*resizeFilterSupport,0.5f);
				2079	scale = PerceptibleReciprocal(scale);
				2080
				2081	const int cacheRangeStartY = max((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
				2082	const int cacheRangeEndY = min((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
				2083
				2084	// cache the input pixels into local memory
				2085	const unsigned int x = get_global_id(0);
				2086	event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
				2087	wait_group_events(1,&e);
				2088
				2089	unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
				2090	for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
				2091	{
				2092
				2093	const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
				2094	const unsigned int chunkStopY = min(chunkStartY + pixelChunkSize, stopY);
				2095	const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
				2096
				2097	// determine which resized pixel computed by this workitem
				2098	const unsigned int itemID = get_local_id(1);
				2099	const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
				2100
				2101	const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
				2102
				2103	float4 filteredPixel = (float4)0.0f;
				2104	float density = 0.0f;
				2105	float gamma = 0.0f;
				2106	// -1 means this workitem doesn't participate in the computation
				2107	if (pixelIndex != -1) {
				2108
				2109	// x coordinated of the resized pixel computed by this workitem
				2110	const int y = chunkStartY + pixelIndex;
				2111
				2112	// calculate how many steps required for this pixel
				2113	const float bisect = (y+0.5)/yFactor+MagickEpsilon;
				2114	const unsigned int start = (unsigned int)max(bisect-support+0.5f,0.0f);
				2115	const unsigned int stop = (unsigned int)min(bisect+support+0.5f,(float)inputRows);
				2116	const unsigned int n = stop - start;
				2117
				2118	// calculate how many steps this workitem will contribute
				2119	unsigned int numStepsPerWorkItem = n / numItems;
				2120	numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
				2121
				2122	const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
				2123	if (startStep < n) {
				2124	const unsigned int stopStep = min(startStep+numStepsPerWorkItem, n);
				2125
				2126	unsigned int cacheIndex = start+startStep-cacheRangeStartY;
				2127	if (matte == 0) {
				2128
				2129	for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
				2130	float4 cp = convert_float4(inputImageCache[cacheIndex]);
				2131
				2132	float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
				2133	, (ResizeWeightingFunctionType)resizeWindowType
				2134	, resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
				2135
				2136	filteredPixel += ((float4)weight)*cp;
				2137	density+=weight;
				2138	}
				2139
				2140
				2141	}
				2142	else {
				2143	for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
				2144	CLPixelType p = inputImageCache[cacheIndex];
				2145
				2146	float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
				2147	, (ResizeWeightingFunctionType)resizeWindowType
				2148	, resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
				2149
				2150	float alpha = weight * QuantumScale * GetPixelAlpha(p);
				2151	float4 cp = convert_float4(p);
				2152
				2153	filteredPixel.x += alpha * cp.x;
				2154	filteredPixel.y += alpha * cp.y;
				2155	filteredPixel.z += alpha * cp.z;
				2156	filteredPixel.w += weight * cp.w;
				2157
				2158	density+=weight;
				2159	gamma+=alpha;
				2160	}
				2161	}
				2162	}
				2163	}
				2164
				2165	// initialize the accumulators to zero
				2166	if (itemID < actualNumPixelInThisChunk) {
				2167	outputPixelCache[itemID] = (float4)0.0f;
				2168	densityCache[itemID] = 0.0f;
				2169	if (matte != 0)
				2170	gammaCache[itemID] = 0.0f;
				2171	}
				2172	barrier(CLK_LOCAL_MEM_FENCE);
				2173
				2174	// accumulatte the filtered pixel value and the density
				2175	for (unsigned int i = 0; i < numItems; i++) {
				2176	if (pixelIndex != -1) {
				2177	if (itemID%numItems == i) {
				2178	outputPixelCache[pixelIndex]+=filteredPixel;
				2179	densityCache[pixelIndex]+=density;
				2180	if (matte!=0) {
				2181	gammaCache[pixelIndex]+=gamma;
				2182	}
				2183	}
				2184	}
				2185	barrier(CLK_LOCAL_MEM_FENCE);
				2186	}
				2187
				2188	if (itemID < actualNumPixelInThisChunk) {
				2189	if (matte==0) {
				2190	float density = densityCache[itemID];
				2191	float4 filteredPixel = outputPixelCache[itemID];
				2192	if (density!= 0.0f && density != 1.0)
				2193	{
				2194	density = PerceptibleReciprocal(density);
				2195	filteredPixel *= (float4)density;
				2196	}
				2197	filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
				2198	, ClampToQuantum(filteredPixel.y)
				2199	, ClampToQuantum(filteredPixel.z)
				2200	, ClampToQuantum(filteredPixel.w));
				2201	}
				2202	else {
				2203	float density = densityCache[itemID];
				2204	float gamma = gammaCache[itemID];
				2205	float4 filteredPixel = outputPixelCache[itemID];
				2206
				2207	if (density!= 0.0f && density != 1.0) {
				2208	density = PerceptibleReciprocal(density);
				2209	filteredPixel *= (float4)density;
				2210	gamma *= density;
				2211	}
				2212	gamma = PerceptibleReciprocal(gamma);
				2213
				2214	CLPixelType fp;
				2215	fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
				2216	, ClampToQuantum(gamma*filteredPixel.y)
				2217	, ClampToQuantum(gamma*filteredPixel.z)
				2218	, ClampToQuantum(filteredPixel.w));
				2219
				2220	filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
				2221
				2222	}
				2223	}
				2224
				2225	} // end of chunking loop
				2226	}
				2227	)
				2228
				2229
				2230
				2231	STRINGIFY(
cristy	e85d0f7	2013-11-27 02:25:43 +0000	[diff] [blame]	2232	__kernel __attribute__((reqd_work_group_size(1, 256, 1)))
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	2233	void ResizeVerticalFilterSinc(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
				2234	, const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
				2235	, const int resizeFilterType, const int resizeWindowType
				2236	, const __global float* resizeFilterCubicCoefficients
				2237	, const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
				2238	, __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
				2239	, __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
				2240	ResizeVerticalFilter(inputImage,inputColumns,inputRows,matte
				2241	,yFactor,filteredImage,filteredColumns,filteredRows
				2242	,SincWeightingFunction, SincWeightingFunction
				2243	,resizeFilterCubicCoefficients
				2244	,resizeFilterScale,resizeFilterSupport,resizeFilterWindowSupport,resizeFilterBlur
				2245	,inputImageCache,numCachedPixels,pixelPerWorkgroup,pixelChunkSize
				2246	,outputPixelCache,densityCache,gammaCache);
				2247	}
				2248	)
cristy	e85d0f7	2013-11-27 02:25:43 +0000	[diff] [blame]	2249
				2250	STRINGIFY(
				2251
				2252
				2253	__kernel void randomNumberGeneratorKernel(__global uint* seeds, const float normalizeRand
				2254	, __global float* randomNumbers, const uint init
				2255	,const uint numRandomNumbers) {
				2256
				2257	unsigned int id = get_global_id(0);
				2258	unsigned int seed[4];
				2259
				2260	if (init!=0) {
				2261	seed[0] = seeds[id*4];
				2262	seed[1] = 0x50a7f451;
				2263	seed[2] = 0x5365417e;
				2264	seed[3] = 0xc3a4171a;
				2265	}
				2266	else {
				2267	seed[0] = seeds[id*4];
				2268	seed[1] = seeds[id*4+1];
				2269	seed[2] = seeds[id*4+2];
				2270	seed[3] = seeds[id*4+3];
				2271	}
				2272
				2273	unsigned int numRandomNumbersPerItem = (numRandomNumbers+get_global_size(0)-1)/get_global_size(0);
				2274	for (unsigned int i = 0; i < numRandomNumbersPerItem; i++) {
				2275	do
				2276	{
				2277	unsigned int alpha=(unsigned int) (seed[1] ^ (seed[1] << 11));
				2278	seed[1]=seed[2];
				2279	seed[2]=seed[3];
				2280	seed[3]=seed[0];
				2281	seed[0]=(seed[0] ^ (seed[0] >> 19)) ^ (alpha ^ (alpha >> 8));
				2282	} while (seed[0] == ~0UL);
				2283	unsigned int pos = (get_group_id(0)get_local_size(0)numRandomNumbersPerItem)
				2284	+ get_local_size(0) * i + get_local_id(0);
				2285
				2286	if (pos >= numRandomNumbers)
				2287	break;
				2288	randomNumbers[pos] = normalizeRand*seed[0];
				2289	}
				2290
				2291	/* save the seeds for the time*/
				2292	seeds[id*4] = seed[0];
				2293	seeds[id*4+1] = seed[1];
				2294	seeds[id*4+2] = seed[2];
				2295	seeds[id*4+3] = seed[3];
				2296	}
				2297
				2298	)
				2299
				2300
				2301	STRINGIFY(
				2302
				2303	typedef enum
				2304	{
				2305	UndefinedNoise,
				2306	UniformNoise,
				2307	GaussianNoise,
				2308	MultiplicativeGaussianNoise,
				2309	ImpulseNoise,
				2310	LaplacianNoise,
				2311	PoissonNoise,
				2312	RandomNoise
				2313	} NoiseType;
				2314
				2315	typedef struct {
				2316	const global float* rns;
				2317	} RandomNumbers;
				2318
				2319
				2320	float GetPseudoRandomValue(RandomNumbers* r) {
				2321	float v = *r->rns;
				2322	r->rns++;
				2323	return v;
				2324	}
				2325	)
				2326
				2327	OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
				2328	OPENCL_DEFINE(SigmaGaussian,(attenuate*0.015625f))
				2329	OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
				2330	OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
				2331	OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
				2332	OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
				2333	OPENCL_DEFINE(SigmaRandom, (attenuate))
				2334	OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
				2335
				2336	STRINGIFY(
				2337	float GenerateDifferentialNoise(RandomNumbers* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
				2338
				2339	float
				2340	alpha,
				2341	beta,
				2342	noise,
				2343	sigma;
				2344
				2345	noise = 0.0f;
				2346	alpha=GetPseudoRandomValue(r);
				2347	switch(noise_type) {
				2348	case UniformNoise:
				2349	default:
				2350	{
				2351	noise=(pixel+QuantumRangeSigmaUniform(alpha-0.5f));
				2352	break;
				2353	}
				2354	case GaussianNoise:
				2355	{
				2356	float
				2357	gamma,
				2358	tau;
				2359
				2360	if (alpha == 0.0f)
				2361	alpha=1.0f;
				2362	beta=GetPseudoRandomValue(r);
				2363	gamma=sqrt(-2.0f*log(alpha));
				2364	sigma=gammacospi((2.0fbeta));
				2365	tau=gammasinpi((2.0fbeta));
				2366	noise=(float)(pixel+sqrt((float) pixel)SigmaGaussiansigma+
				2367	QuantumRangeTauGaussiantau);
				2368	break;
				2369	}
				2370
				2371
				2372	case ImpulseNoise:
				2373	{
				2374	if (alpha < (SigmaImpulse/2.0f))
				2375	noise=0.0f;
				2376	else
				2377	if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
				2378	noise=(float)QuantumRange;
				2379	else
				2380	noise=(float)pixel;
				2381	break;
				2382	}
				2383	case LaplacianNoise:
				2384	{
				2385	if (alpha <= 0.5f)
				2386	{
				2387	if (alpha <= MagickEpsilon)
				2388	noise=(float) (pixel-QuantumRange);
				2389	else
				2390	noise=(float) (pixel+QuantumRangeSigmaLaplacianlog(2.0f*alpha)+
				2391	0.5f);
				2392	break;
				2393	}
				2394	beta=1.0f-alpha;
				2395	if (beta <= (0.5f*MagickEpsilon))
				2396	noise=(float) (pixel+QuantumRange);
				2397	else
				2398	noise=(float) (pixel-QuantumRangeSigmaLaplacianlog(2.0f*beta)+0.5f);
				2399	break;
				2400	}
				2401	case MultiplicativeGaussianNoise:
				2402	{
				2403	sigma=1.0f;
				2404	if (alpha > MagickEpsilon)
				2405	sigma=sqrt(-2.0f*log(alpha));
				2406	beta=GetPseudoRandomValue(r);
				2407	noise=(float) (pixel+pixelSigmaMultiplicativeGaussiansigma*
				2408	cospi((float) (2.0f*beta))/2.0f);
				2409	break;
				2410	}
				2411	case PoissonNoise:
				2412	{
				2413	float
				2414	poisson;
				2415	unsigned int i;
				2416	poisson=exp(-SigmaPoissonQuantumScalepixel);
				2417	for (i=0; alpha > poisson; i++)
				2418	{
				2419	beta=GetPseudoRandomValue(r);
				2420	alpha*=beta;
				2421	}
				2422	noise=(float) (QuantumRange*i/SigmaPoisson);
				2423	break;
				2424	}
				2425	case RandomNoise:
				2426	{
				2427	noise=(float) (QuantumRangeSigmaRandomalpha);
				2428	break;
				2429	}
				2430
				2431	};
				2432	return noise;
				2433	}
				2434
				2435	__kernel
				2436	void AddNoiseImage(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
				2437	,const unsigned int inputColumns, const unsigned int inputRows
				2438	,const ChannelType channel
				2439	,const NoiseType noise_type, const float attenuate
				2440	,const __global float* randomNumbers, const unsigned int numRandomNumbersPerPixel
				2441	,const unsigned int rowOffset) {
				2442
				2443	unsigned int x = get_global_id(0);
				2444	unsigned int y = get_global_id(1) + rowOffset;
				2445	RandomNumbers r;
				2446	r.rns = randomNumbers + (get_global_id(1) * inputColumns + get_global_id(0))*numRandomNumbersPerPixel;
				2447
				2448	CLPixelType p = inputImage[y*inputColumns+x];
				2449	CLPixelType q = filteredImage[y*inputColumns+x];
				2450
				2451	if ((channel&RedChannel)!=0) {
				2452	setRed(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getRed(p),noise_type,attenuate)));
				2453	}
				2454
				2455	if ((channel&GreenChannel)!=0) {
				2456	setGreen(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getGreen(p),noise_type,attenuate)));
				2457	}
				2458
				2459	if ((channel&BlueChannel)!=0) {
				2460	setBlue(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getBlue(p),noise_type,attenuate)));
				2461	}
				2462
				2463	if ((channel & OpacityChannel) != 0) {
				2464	setOpacity(&q,ClampToQuantum(GenerateDifferentialNoise(&r,getOpacity(p),noise_type,attenuate)));
				2465	}
				2466
				2467	filteredImage[y*inputColumns+x] = q;
				2468	}
				2469
				2470	)
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	2471	;
				2472
				2473
cristy	e85d0f7	2013-11-27 02:25:43 +0000	[diff] [blame]	2474
				2475
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	2476	#endif // MAGICKCORE_OPENCL_SUPPORT
				2477
cristy	e110189	2013-08-18 00:53:18 +0000	[diff] [blame]	2478	#if defined(__cplusplus) \|\| defined(c_plusplus)
				2479	}
				2480	#endif
				2481
cristy	f034abb	2013-11-24 14:16:14 +0000	[diff] [blame]	2482	#endif // _MAGICKCORE_ACCELERATE_PRIVATE_H