drm/amdgpu: use ref to keep job alive

this is to fix fatal page fault error that occured if:
job is signaled/released after its timeout work is already
put to the global queue (in this case the cancel_delayed_work
will return false), which will lead to NX-protection error
page fault during job_timeout_func.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ccb2846..2ac07eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -750,7 +750,9 @@
 		     struct amdgpu_job **job);
 int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
 			     struct amdgpu_job **job);
+
 void amdgpu_job_free(struct amdgpu_job *job);
+void amdgpu_job_free_func(struct kref *refcount);
 int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
 		      struct amd_sched_entity *entity, void *owner,
 		      struct fence **f);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 9025671..d7e0b0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -872,6 +872,7 @@
 	r = amd_sched_job_init(&job->base, &ring->sched,
 						&p->ctx->rings[ring->idx].entity,
 						amdgpu_job_timeout_func,
+						amdgpu_job_free_func,
 						p->filp, &fence);
 	if (r) {
 		amdgpu_job_free(job);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 961cae4..a052ac2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -31,7 +31,7 @@
 static void amdgpu_job_free_handler(struct work_struct *ws)
 {
 	struct amdgpu_job *job = container_of(ws, struct amdgpu_job, base.work_free_job);
-	kfree(job);
+	amd_sched_job_put(&job->base);
 }
 
 void amdgpu_job_timeout_func(struct work_struct *work)
@@ -41,6 +41,8 @@
 				job->base.sched->name,
 				(uint32_t)atomic_read(&job->ring->fence_drv.last_seq),
 				job->ring->fence_drv.sync_seq);
+
+	amd_sched_job_put(&job->base);
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
@@ -101,6 +103,12 @@
 		kfree(job);
 }
 
+void amdgpu_job_free_func(struct kref *refcount)
+{
+	struct amdgpu_job *job = container_of(refcount, struct amdgpu_job, base.refcount);
+	kfree(job);
+}
+
 int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
 		      struct amd_sched_entity *entity, void *owner,
 		      struct fence **f)
@@ -113,9 +121,10 @@
 		return -EINVAL;
 
 	r = amd_sched_job_init(&job->base, &ring->sched,
-							entity, owner,
+							entity,
 							amdgpu_job_timeout_func,
-							&fence);
+							amdgpu_job_free_func,
+							owner, &fence);
 	if (r)
 		return r;