[PATCH] create and destroy cpufreq sysfs entries based on cpu notifiers

cpufreq entries in sysfs should only be populated when CPU is online state.
 When we either boot with maxcpus=x and then boot the other cpus by echoing
to sysfs online file, these entries should be created and destroyed when
CPU_DEAD is notified.  Same treatement as cache entries under sysfs.

We place the processor in the lowest frequency, so hw managed P-State
transitions can still work on the other threads to save power.

Primary goal was to just make these directories appear/disapper dynamically.

There is one in this patch i had to do, which i really dont like myself but
probably best if someone handling the cpufreq infrastructure could give
this code right treatment if this is not acceptable.  I guess its probably
good for the first cut.

- Converting lock_cpu_hotplug()/unlock_cpu_hotplug() to disable/enable preempt.
  The locking was smack in the middle of the notification path, when the
  hotplug is already holding the lock. I tried another solution to avoid this
  so avoid taking locks if we know we are from notification path. The solution
  was getting very ugly and i decided this was probably good for this iteration
  until someone who understands cpufreq could do a better job than me.

(akpm: export cpucontrol to GPL modules: drivers/cpufreq/cpufreq_stats.c now
does lock_cpu_hotplug())

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Zwane Mwaikambo <zwane@holomorphy.com>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 346906a..6c6121b 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -4,6 +4,9 @@
  *  Copyright (C) 2001 Russell King
  *            (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
  *
+ *  Oct 2005 - Ashok Raj <ashok.raj@intel.com>
+ *         		Added handling for CPU hotplug
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -567,6 +570,9 @@
 	unsigned long flags;
 	unsigned int j;
 
+	if (cpu_is_offline(cpu))
+		return 0;
+
 	cpufreq_debug_disable_ratelimit();
 	dprintk("adding CPU %u\n", cpu);
 
@@ -673,7 +679,7 @@
 
 nomem_out:
 	module_put(cpufreq_driver->owner);
- module_out:
+module_out:
 	cpufreq_debug_enable_ratelimit();
 	return ret;
 }
@@ -762,7 +768,6 @@
 	down(&data->lock);
 	if (cpufreq_driver->target)
 		__cpufreq_governor(data, CPUFREQ_GOV_STOP);
-	cpufreq_driver->target = NULL;
 	up(&data->lock);
 
 	kobject_unregister(&data->kobj);
@@ -1109,17 +1114,30 @@
 			    unsigned int relation)
 {
 	int retval = -EINVAL;
-	lock_cpu_hotplug();
+
+	/*
+	 * Converted the lock_cpu_hotplug to preempt_disable()
+	 * and preempt_enable(). This is a bit kludgy and relies on how cpu
+	 * hotplug works. All we need is a guarantee that cpu hotplug won't make
+	 * progress on any cpu. Once we do preempt_disable(), this would ensure
+	 * that hotplug threads don't get onto this cpu, thereby delaying
+	 * the cpu remove process.
+	 *
+	 * We removed the lock_cpu_hotplug since we need to call this function
+	 * via cpu hotplug callbacks, which result in locking the cpu hotplug
+	 * thread itself. Agree this is not very clean, cpufreq community
+	 * could improve this if required. - Ashok Raj <ashok.raj@intel.com>
+	 */
+	preempt_disable();
 	dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu,
 		target_freq, relation);
 	if (cpu_online(policy->cpu) && cpufreq_driver->target)
 		retval = cpufreq_driver->target(policy, target_freq, relation);
-	unlock_cpu_hotplug();
+	preempt_enable();
 	return retval;
 }
 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
 
-
 int cpufreq_driver_target(struct cpufreq_policy *policy,
 			  unsigned int target_freq,
 			  unsigned int relation)
@@ -1406,6 +1424,45 @@
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
+static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct cpufreq_policy *policy;
+	struct sys_device *sys_dev;
+
+	sys_dev = get_cpu_sysdev(cpu);
+
+	if (sys_dev) {
+		switch (action) {
+		case CPU_ONLINE:
+			cpufreq_add_dev(sys_dev);
+			break;
+		case CPU_DOWN_PREPARE:
+			/*
+			 * We attempt to put this cpu in lowest frequency
+			 * possible before going down. This will permit
+			 * hardware-managed P-State to switch other related
+			 * threads to min or higher speeds if possible.
+			 */
+			policy = cpufreq_cpu_data[cpu];
+			if (policy) {
+				cpufreq_driver_target(policy, policy->min,
+						CPUFREQ_RELATION_H);
+			}
+			break;
+		case CPU_DEAD:
+			cpufreq_remove_dev(sys_dev);
+			break;
+		}
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_cpu_notifier =
+{
+    .notifier_call = cpufreq_cpu_callback,
+};
 
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
@@ -1466,6 +1523,7 @@
 	}
 
 	if (!ret) {
+		register_cpu_notifier(&cpufreq_cpu_notifier);
 		dprintk("driver %s up and running\n", driver_data->name);
 		cpufreq_debug_enable_ratelimit();
 	}
@@ -1497,6 +1555,7 @@
 	dprintk("unregistering driver %s\n", driver->name);
 
 	sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
+	unregister_cpu_notifier(&cpufreq_cpu_notifier);
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;