drm/amdgpu: use ref to keep job alive

Fix a fatal page fault that occurred when a job was signaled/released
after its timeout work had already been queued to the global
workqueue (in which case cancel_delayed_work() returns false), which
led to an NX-protection page fault when job_timeout_func ran on the
already-freed job.

Keep the job alive by taking a reference when its timeout work is
scheduled and dropping it only if the pending work is successfully
canceled, so the timeout handler can never run against a freed job.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
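
(Not part of the patch: a minimal, self-contained sketch of the
lifetime rule this change establishes. All names here (my_job,
my_job_free, my_job_timeout, my_job_begin, my_job_finish) are
hypothetical, and kref_init() at job creation is assumed; the real
code lives in gpu_scheduler.c/h and the driver.)

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_job {
	struct kref refcount;
	struct delayed_work work_tdr;
};

static void my_job_free(struct kref *refcount)
{
	kfree(container_of(refcount, struct my_job, refcount));
}

/* The timeout handler owns the reference taken when the work was
 * queued; it must drop it once it is done with the job. */
static void my_job_timeout(struct work_struct *work)
{
	struct my_job *job =
		container_of(work, struct my_job, work_tdr.work);

	/* ... hang handling would go here ... */

	kref_put(&job->refcount, my_job_free);
}

/* Arming the timeout hands one reference to the pending work. */
static void my_job_begin(struct my_job *job, unsigned long timeout)
{
	INIT_DELAYED_WORK(&job->work_tdr, my_job_timeout);
	kref_get(&job->refcount);
	schedule_delayed_work(&job->work_tdr, timeout);
}

/* On completion, drop the work's reference only if the work was
 * actually dequeued.  cancel_delayed_work() returning false means
 * the handler is running (or has already run) and still owns that
 * reference, so the job cannot be freed out from under it. */
static void my_job_finish(struct my_job *job)
{
	if (cancel_delayed_work(&job->work_tdr))
		kref_put(&job->refcount, my_job_free);
}

(Before this change the completion path freed the job
unconditionally, so a timeout work item that had already started
could run against freed memory, which is what surfaced as the
NX-protection fault.)
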
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index b7e8071..639c70d 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -333,7 +333,8 @@
 	struct amd_gpu_scheduler *sched = s_job->sched;
 
 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT) {
-		cancel_delayed_work(&s_job->work_tdr); /*TODO: how to deal the case that tdr is running */
+		if (cancel_delayed_work(&s_job->work_tdr))
+			amd_sched_job_put(s_job);
 
 		/* queue TDR for next job */
 		next = list_first_entry_or_null(&sched->ring_mirror_list,
@@ -341,6 +342,7 @@
 
 		if (next) {
 			INIT_DELAYED_WORK(&next->work_tdr, s_job->timeout_callback);
+			amd_sched_job_get(next);
 			schedule_delayed_work(&next->work_tdr, sched->timeout);
 		}
 	}
@@ -354,6 +356,7 @@
 		list_first_entry_or_null(&sched->ring_mirror_list, struct amd_sched_job, node) == s_job)
 	{
 		INIT_DELAYED_WORK(&s_job->work_tdr, s_job->timeout_callback);
+		amd_sched_job_get(s_job);
 		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
 	}
 }
@@ -382,9 +385,11 @@
 						struct amd_gpu_scheduler *sched,
 						struct amd_sched_entity *entity,
 						void (*timeout_cb)(struct work_struct *work),
+						void (*free_cb)(struct kref *refcount),
 						void *owner, struct fence **fence)
 {
 	INIT_LIST_HEAD(&job->node);
+	kref_init(&job->refcount);
 	job->sched = sched;
 	job->s_entity = entity;
 	job->s_fence = amd_sched_fence_create(entity, owner);
@@ -393,6 +398,7 @@
 
 	job->s_fence->s_job = job;
 	job->timeout_callback = timeout_cb;
+	job->free_callback = free_cb;
 
 	if (fence)
 		*fence = &job->s_fence->base;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index a5700ad..95ebfd0 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -78,6 +78,7 @@
 };
 
 struct amd_sched_job {
+	struct kref refcount;
 	struct amd_gpu_scheduler        *sched;
 	struct amd_sched_entity         *s_entity;
 	struct amd_sched_fence          *s_fence;
@@ -87,6 +88,7 @@
 	struct list_head			   node;
 	struct delayed_work work_tdr;
 	void (*timeout_callback) (struct work_struct *work);
+	void (*free_callback)(struct kref *refcount);
 };
 
 extern const struct fence_ops amd_sched_fence_ops;
@@ -155,9 +157,22 @@
 					struct amd_gpu_scheduler *sched,
 					struct amd_sched_entity *entity,
 					void (*timeout_cb)(struct work_struct *work),
+					void (*free_cb)(struct kref *refcount),
 					void *owner, struct fence **fence);
 void amd_sched_job_pre_schedule(struct amd_gpu_scheduler *sched ,
 								struct amd_sched_job *s_job);
 void amd_sched_job_finish(struct amd_sched_job *s_job);
 void amd_sched_job_begin(struct amd_sched_job *s_job);
+static inline void amd_sched_job_get(struct amd_sched_job *job)
+{
+	if (job)
+		kref_get(&job->refcount);
+}
+
+static inline void amd_sched_job_put(struct amd_sched_job *job)
+{
+	if (job)
+		kref_put(&job->refcount, job->free_callback);
+}
+
 #endif
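
(Also not part of the patch: a sketch of how a caller might adapt to
the new amd_sched_job_init() signature. my_timeout_func, my_free_func,
and my_submit are hypothetical names; the real amdgpu callers are
outside this diff. The handler's closing amd_sched_job_put() follows
from the get/put placement above: when cancel_delayed_work() fails,
the reference taken at schedule time can only be dropped by the
handler itself.)

#include <linux/slab.h>
#include "gpu_scheduler.h"

static void my_timeout_func(struct work_struct *work)
{
	struct amd_sched_job *s_job =
		container_of(work, struct amd_sched_job, work_tdr.work);

	/* ... driver-specific hang handling ... */

	/* The pending work owns a reference; drop it when done. */
	amd_sched_job_put(s_job);
}

/* Release callback passed as free_cb; invoked by kref_put() through
 * job->free_callback once the last reference is dropped. */
static void my_free_func(struct kref *refcount)
{
	struct amd_sched_job *s_job =
		container_of(refcount, struct amd_sched_job, refcount);

	kfree(s_job);
}

static void my_submit(struct amd_sched_job *job,
		      struct amd_gpu_scheduler *sched,
		      struct amd_sched_entity *entity,
		      void *owner)
{
	struct fence *fence;

	/* free_cb now sits between timeout_cb and owner; error
	 * handling of the init call is elided in this sketch. */
	amd_sched_job_init(job, sched, entity,
			   my_timeout_func, my_free_func,
			   owner, &fence);
}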