drm/amdgpu: use ref to keep job alive
This fixes a fatal page fault error that occurred if a
job is signaled/released after its timeout work has already been
put on the global queue (in this case cancel_delayed_work
will return false), which leads to an NX-protection error
page fault during job_timeout_func.
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index a5700ad..95ebfd0 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -78,6 +78,7 @@
};
struct amd_sched_job {
+ struct kref refcount;
struct amd_gpu_scheduler *sched;
struct amd_sched_entity *s_entity;
struct amd_sched_fence *s_fence;
@@ -87,6 +88,7 @@
struct list_head node;
struct delayed_work work_tdr;
void (*timeout_callback) (struct work_struct *work);
+ void (*free_callback)(struct kref *refcount);
};
extern const struct fence_ops amd_sched_fence_ops;
@@ -155,9 +157,20 @@
struct amd_gpu_scheduler *sched,
struct amd_sched_entity *entity,
void (*timeout_cb)(struct work_struct *work),
+ void (*free_cb)(struct kref* refcount),
void *owner, struct fence **fence);
void amd_sched_job_pre_schedule(struct amd_gpu_scheduler *sched ,
struct amd_sched_job *s_job);
void amd_sched_job_finish(struct amd_sched_job *s_job);
void amd_sched_job_begin(struct amd_sched_job *s_job);
+static inline void amd_sched_job_get(struct amd_sched_job *job) { /* Take one reference on @job; a NULL @job is accepted and ignored. */
+ if (job) /* no job passed: nothing to reference */
+ kref_get(&job->refcount); /* bump the kref embedded in the job */
+}
+
+static inline void amd_sched_job_put(struct amd_sched_job *job) { /* Drop one reference on @job; a NULL @job is accepted and ignored. */
+ if (job) /* no job passed: nothing to release */
+ kref_put(&job->refcount, job->free_callback); /* job->free_callback is handed to kref_put as the release function */
+}
+
#endif