drm: Use a little stash on the stack to avoid kmalloc in most DRM ioctls.

The kmalloc was taking up about 1.5% of the CPU on an ioctl-heavy workload
(x11perf -aa10text on 965).  Initial results look like they have a
corresponding improvement in performance for aa10text, but more numbers might
not hurt.

Thanks to ajax for pointing out this performance regression I'd introduced
back in 2007.

[airlied: well I introduced it sneakily inside Eric's patch]

Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index c26ee08..c4ada8b 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -421,6 +421,7 @@
 	drm_ioctl_t *func;
 	unsigned int nr = DRM_IOCTL_NR(cmd);
 	int retcode = -EINVAL;
+	char stack_kdata[128];
 	char *kdata = NULL;
 
 	atomic_inc(&dev->ioctl_count);
@@ -459,10 +460,14 @@
 		retcode = -EACCES;
 	} else {
 		if (cmd & (IOC_IN | IOC_OUT)) {
-			kdata = kmalloc(_IOC_SIZE(cmd), GFP_KERNEL);
-			if (!kdata) {
-				retcode = -ENOMEM;
-				goto err_i1;
+			if (_IOC_SIZE(cmd) <= sizeof(stack_kdata)) {
+				kdata = stack_kdata;
+			} else {
+				kdata = kmalloc(_IOC_SIZE(cmd), GFP_KERNEL);
+				if (!kdata) {
+					retcode = -ENOMEM;
+					goto err_i1;
+				}
 			}
 		}
 
@@ -483,7 +488,7 @@
 	}
 
       err_i1:
-	if (kdata)
+	if (kdata != stack_kdata)
 		kfree(kdata);
 	atomic_dec(&dev->ioctl_count);
 	if (retcode)