author     Christian König <christian.koenig@amd.com>    2020-08-12 17:48:26 +0200
committer  Alex Deucher <alexander.deucher@amd.com>      2020-08-14 16:22:40 -0400
commit     f1403342ebdfcff3c3cf57ae476f19d3078f2767 (patch)
tree       d94e6a6c652ebc0688fdb0c57587712c75970472 /drivers/gpu
parent     05f39286ce11b98376e0d179aff0d537c257e772 (diff)
drm/amdgpu: revert "fix system hang issue during GPU reset"
The whole approach wasn't thought through till the end. We already had a
reset lock like this in the past, and it caused the same problems as this
one. Completely revert the patch for now and add individual trylock
protection to the hardware access functions as necessary.

This reverts commit df9c8d1aa278c435c30a69b8f2418b4a52fcb929.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
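[Editor's note] The "individual trylock protection to the hardware access functions" mentioned above is only a stated plan, not part of this patch. As a rough, hedged sketch of what such a guard typically looks like (the struct and function names below are made up for illustration and do not appear in this commit):

#include <linux/mutex.h>
#include <linux/errno.h>

/*
 * Illustrative only: a device with a reset lock, mirroring the fields this
 * patch restores in struct amdgpu_device (a mutex plus a plain flag).
 */
struct example_dev {
	struct mutex lock_reset;
	bool in_gpu_reset;
};

/*
 * Hypothetical hardware access path: back off with -EIO instead of blocking
 * if a GPU reset currently holds the lock.
 */
static int example_hw_access(struct example_dev *dev)
{
	if (!mutex_trylock(&dev->lock_reset))
		return -EIO;

	/* ... program registers here ... */

	mutex_unlock(&dev->lock_reset);
	return 0;
}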
Diffstat (limited to 'drivers/gpu')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu.h                          9
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c                  40
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c           2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c            2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c            2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c            2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c             7
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c                       4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c                      4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c                 14
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c                  57
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c                      4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c                      6
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c                     14
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c                      4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c                     353
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c                      4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c                      4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c                      2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c                    3
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c                     2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h                     4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c                    11
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h                     3
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/atom.c                            1
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c                      10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c                        6
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c                       10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c                       4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c                        2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c                        2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c                        7
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c                       13
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c                       13
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c       16
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_process.c                     4
-rw-r--r--  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c            4
-rw-r--r--  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c                   2
-rw-r--r--  drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c           2
39 files changed, 184 insertions, 469 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 3e82a11577d9..08f80ca3b296 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -949,9 +949,9 @@ struct amdgpu_device {
bool in_suspend;
bool in_hibernate;
- atomic_t in_gpu_reset;
+ bool in_gpu_reset;
enum pp_mp1_state mp1_state;
- struct rw_semaphore reset_sem;
+ struct mutex lock_reset;
struct amdgpu_doorbell_index doorbell_index;
struct mutex notifier_lock;
@@ -1266,9 +1266,4 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
return adev->gmc.tmz_enabled;
}
-static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
-{
- return atomic_read(&adev->in_gpu_reset) ? true : false;
-}
-
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 9738dccb1c2c..0effc1d46824 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -244,14 +244,11 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
if (cp_mqd_gfx9)
bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
- if (!down_read_trylock(&adev->reset_sem))
- return -EIO;
-
r = amdgpu_bo_create(adev, &bp, &bo);
if (r) {
dev_err(adev->dev,
"failed to allocate BO for amdkfd (%d)\n", r);
- goto err;
+ return r;
}
/* map the buffer */
@@ -286,7 +283,6 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
amdgpu_bo_unreserve(bo);
- up_read(&adev->reset_sem);
return 0;
allocate_mem_kmap_bo_failed:
@@ -295,25 +291,19 @@ allocate_mem_pin_bo_failed:
amdgpu_bo_unreserve(bo);
allocate_mem_reserve_bo_failed:
amdgpu_bo_unref(&bo);
-err:
- up_read(&adev->reset_sem);
+
return r;
}
void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
{
- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
- down_read(&adev->reset_sem);
-
amdgpu_bo_reserve(bo, true);
amdgpu_bo_kunmap(bo);
amdgpu_bo_unpin(bo);
amdgpu_bo_unreserve(bo);
amdgpu_bo_unref(&(bo));
-
- up_read(&adev->reset_sem);
}
int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
@@ -345,14 +335,9 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
{
- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;
- down_read(&adev->reset_sem);
-
amdgpu_bo_unref(&bo);
-
- up_read(&adev->reset_sem);
}
uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
@@ -626,15 +611,8 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
/* This works for NO_HWS. TODO: need to handle without knowing VMID */
job->vmid = vmid;
- if (!down_read_trylock(&adev->reset_sem)) {
- ret = -EIO;
- goto err_ib_sched;
- }
-
ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
- up_read(&adev->reset_sem);
-
if (ret) {
DRM_ERROR("amdgpu: failed to schedule IB.\n");
goto err_ib_sched;
@@ -670,9 +648,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
- if (!down_read_trylock(&adev->reset_sem))
- return -EIO;
-
if (adev->family == AMDGPU_FAMILY_AI) {
int i;
@@ -682,8 +657,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
}
- up_read(&adev->reset_sem);
-
return 0;
}
@@ -692,18 +665,11 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
const uint32_t flush_type = 0;
bool all_hub = false;
- int ret = -EIO;
if (adev->family == AMDGPU_FAMILY_AI)
all_hub = true;
- if (down_read_trylock(&adev->reset_sem)) {
- ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev,
- pasid, flush_type, all_hub);
- up_read(&adev->reset_sem);
- }
-
- return ret;
+ return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
}
bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index b0dcc800251e..bf927f432506 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -542,7 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return -EIO;
#if 0
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 275f20399373..744366c7ee85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -423,7 +423,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies;
int retry;
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return -EIO;
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index 4997189d8b36..feab4cc6e836 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -419,7 +419,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
struct vi_mqd *m = get_mqd(mqd);
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return -EIO;
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index d5d997fe6aa4..e4c274bd35c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -539,7 +539,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return -EIO;
acquire_queue(kgd, pipe_id, queue_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index fcf72f337785..62cb510e2cc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1194,9 +1194,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
return -EINVAL;
}
- if (!down_read_trylock(&adev->reset_sem))
- return -EIO;
-
*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
if (!*mem) {
ret = -ENOMEM;
@@ -1263,7 +1260,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
if (offset)
*offset = amdgpu_bo_mmap_offset(bo);
- up_read(&adev->reset_sem);
return 0;
allocate_init_user_pages_failed:
@@ -1281,9 +1277,6 @@ err:
sg_free_table(sg);
kfree(sg);
}
-
- up_read(&adev->reset_sem);
-
return ret;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index a3b150304dae..a512ccbc4dea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1292,8 +1292,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
parser.adev = adev;
parser.filp = filp;
- down_read(&adev->reset_sem);
-
r = amdgpu_cs_parser_init(&parser, data);
if (r) {
DRM_ERROR("Failed to initialize parser %d!\n", r);
@@ -1333,8 +1331,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
out:
amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
- up_read(&adev->reset_sem);
-
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index d85d13f7a043..8842c55d4490 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -358,8 +358,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
if (atomic_read(&ctx->guilty))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
- down_read(&adev->reset_sem);
-
/*query ue count*/
ras_counter = amdgpu_ras_query_error_count(adev, false);
/*ras counter is monotonic increasing*/
@@ -375,8 +373,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
ctx->ras_counter_ce = ras_counter;
}
- up_read(&adev->reset_sem);
-
mutex_unlock(&mgr->lock);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 0af249a1e35b..35fed75a4397 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -101,14 +101,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
file->private_data = adev;
- down_read(&adev->reset_sem);
+ mutex_lock(&adev->lock_reset);
if (adev->autodump.dumping.done) {
reinit_completion(&adev->autodump.dumping);
ret = 0;
} else {
ret = -EBUSY;
}
- up_read(&adev->reset_sem);
+ mutex_unlock(&adev->lock_reset);
return ret;
}
@@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_
poll_wait(file, &adev->autodump.gpu_hang, poll_table);
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return POLLIN | POLLRDNORM | POLLWRNORM;
return 0;
@@ -1242,7 +1242,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
}
/* Avoid accidently unparking the sched thread during GPU reset */
- down_read(&adev->reset_sem);
+ mutex_lock(&adev->lock_reset);
/* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1269,7 +1269,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
kthread_unpark(ring->sched.thread);
}
- up_read(&adev->reset_sem);
+ mutex_unlock(&adev->lock_reset);
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1459,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
/* Avoid accidently unparking the sched thread during GPU reset */
- down_read(&adev->reset_sem);
+ mutex_lock(&adev->lock_reset);
/* stop the scheduler */
kthread_park(ring->sched.thread);
@@ -1500,7 +1500,7 @@ failure:
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
- up_read(&adev->reset_sem);
+ mutex_unlock(&adev->lock_reset);
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bb7f0c8611f9..415e1a32b98c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].status.hw == true)
break;
- if (amdgpu_in_reset(adev) || adev->in_suspend) {
+ if (adev->in_gpu_reset || adev->in_suspend) {
r = adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> failed %d\n",
@@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
AMDGPU_RESET_MAGIC_NUM))
return true;
- if (!amdgpu_in_reset(adev))
+ if (!adev->in_gpu_reset)
return false;
/*
@@ -3053,8 +3053,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
- init_rwsem(&adev->reset_sem);
- atomic_set(&adev->in_gpu_reset, 0);
+ mutex_init(&adev->lock_reset);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
@@ -4082,11 +4081,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) {
/* post card */
- if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
- dev_warn(tmp_adev->dev, "asic atom init failed!");
- r = -EAGAIN;
- goto out;
- }
+ if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
+ DRM_WARN("asic atom init failed!");
if (!r) {
dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
@@ -4176,18 +4172,16 @@ end:
return r;
}
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
{
- if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
- return false;
-
- if (hive) {
- down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
- } else {
- down_write(&adev->reset_sem);
- }
+ if (trylock) {
+ if (!mutex_trylock(&adev->lock_reset))
+ return false;
+ } else
+ mutex_lock(&adev->lock_reset);
atomic_inc(&adev->gpu_reset_counter);
+ adev->in_gpu_reset = true;
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4207,8 +4201,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
- atomic_set(&adev->in_gpu_reset, 0);
- up_write(&adev->reset_sem);
+ adev->in_gpu_reset = false;
+ mutex_unlock(&adev->lock_reset);
}
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -4318,14 +4312,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* We always reset all schedulers for device and all devices for XGMI
* hive so that should take care of them too.
*/
- hive = amdgpu_get_xgmi_hive(adev, false);
- if (hive) {
- if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
- DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
- job ? job->base.id : -1, hive->hive_id);
- return 0;
- }
- mutex_lock(&hive->hive_lock);
+ hive = amdgpu_get_xgmi_hive(adev, true);
+ if (hive && !mutex_trylock(&hive->reset_lock)) {
+ DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
+ job ? job->base.id : -1, hive->hive_id);
+ mutex_unlock(&hive->hive_lock);
+ return 0;
}
/*
@@ -4347,11 +4339,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
+ if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
job ? job->base.id : -1);
- r = 0;
- goto skip_recovery;
+ mutex_unlock(&hive->hive_lock);
+ return 0;
}
/*
@@ -4484,9 +4476,8 @@ skip_sched_resume:
amdgpu_device_unlock_adev(tmp_adev);
}
-skip_recovery:
if (hive) {
- atomic_set(&hive->in_reset, 0);
+ mutex_unlock(&hive->reset_lock);
mutex_unlock(&hive->hive_lock);
}
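[Editor's note] Taken together, the amdgpu_device.c hunks above restore the pre-df9c8d1aa278 recovery flow: the reset path takes adev->lock_reset (trying the lock only when the device is not part of an XGMI hive), bumps the reset counter, sets in_gpu_reset, and then clears the flag and drops the mutex in amdgpu_device_unlock_adev(). A condensed, hedged sketch of that pairing, with the hive handling reduced to a boolean and the struct/function names invented for illustration:

#include <linux/mutex.h>
#include <linux/atomic.h>

struct example_dev {
	struct mutex lock_reset;
	bool in_gpu_reset;
	atomic_t gpu_reset_counter;
};

/*
 * Sketch of the restored lock/unlock pairing around a reset. "trylock" is
 * true for a standalone device and false when the caller already serializes
 * through the XGMI hive lock, matching amdgpu_device_lock_adev(adev, !hive).
 */
static bool example_lock_for_reset(struct example_dev *dev, bool trylock)
{
	if (trylock) {
		if (!mutex_trylock(&dev->lock_reset))
			return false;	/* another reset is already running */
	} else {
		mutex_lock(&dev->lock_reset);
	}

	atomic_inc(&dev->gpu_reset_counter);
	dev->in_gpu_reset = true;
	return true;
}

static void example_unlock_after_reset(struct example_dev *dev)
{
	dev->in_gpu_reset = false;
	mutex_unlock(&dev->lock_reset);
}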
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 73cc68ab53d0..7f9e50247413 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -671,8 +671,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
bo_va = NULL;
}
- down_read(&adev->reset_sem);
-
switch (args->operation) {
case AMDGPU_VA_OP_MAP:
va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
@@ -702,8 +700,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va,
args->operation);
- up_read(&adev->reset_sem);
-
error_backoff:
ttm_eu_backoff_reservation(&ticket, &list);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 8ccd17d02cc6..a819360a4b6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -719,7 +719,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
*
* also don't wait anymore for IRQ context
* */
- if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
+ if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
goto failed_kiq_read;
might_sleep();
@@ -777,7 +777,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
*
* also don't wait anymore for IRQ context
* */
- if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
+ if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
goto failed_kiq_write;
might_sleep();
@@ -796,5 +796,5 @@ failed_undo:
amdgpu_ring_undo(ring);
spin_unlock_irqrestore(&kiq->ring_lock, flags);
failed_kiq_write:
- dev_warn(adev->dev, "failed to write reg:%x\n", reg);
+ pr_err("failed to write reg:%x\n", reg);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 75d37dfb51aa..937029ad5271 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -220,17 +220,17 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
trace_amdgpu_sched_run_job(job);
- if (down_read_trylock(&ring->adev->reset_sem)) {
+ if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
+ dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
+
+ if (finished->error < 0) {
+ DRM_INFO("Skip scheduling IBs!\n");
+ } else {
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
- &fence);
- up_read(&ring->adev->reset_sem);
+ &fence);
if (r)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
- } else {
- dma_fence_set_error(finished, -ECANCELED);
- DRM_INFO("Skip scheduling IBs!\n");
}
-
/* if gpu reset, hw fence will be replaced here */
dma_fence_put(job->fence);
job->fence = dma_fence_get(fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 58580a48b648..7619f1c3084d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1087,8 +1087,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
if (!fpriv)
return;
- down_read(&adev->reset_sem);
-
pm_runtime_get_sync(dev->dev);
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_UVD) != NULL)
@@ -1127,8 +1125,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
-
- up_read(&adev->reset_sem);
}
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index 1705e328c6fc..65ad174bb976 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -163,7 +163,7 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
enum amd_pm_state_type pm;
int ret;
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return -EPERM;
ret = pm_runtime_get_sync(ddev->dev);
@@ -172,8 +172,6 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
return ret;
}
- down_read(&adev->reset_sem);
-
if (is_support_sw_smu(adev)) {
if (adev->smu.ppt_funcs->get_current_power_state)
pm = smu_get_current_power_state(&adev->smu);
@@ -185,8 +183,6 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
pm = adev->pm.dpm.user_state;
}
- up_read(&adev->reset_sem);
-
pm_runtime_mark_last_busy(ddev->dev);
pm_runtime_put_autosuspend(ddev->dev);
@@ -205,7 +201,7 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
enum amd_pm_state_type state;
int ret;
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return -EPERM;
if (strncmp("battery", buf, strlen("battery")) == 0)
@@ -223,8 +219,6 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
return ret;
}
- down_read(&adev->reset_sem);
-
if (is_support_sw_smu(adev)) {
mutex_lock(&adev->pm.mutex);
adev->pm.dpm.user_state = state;
@@ -238,9 +232,6 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
amdgpu_pm_compute_clocks(adev);
}
-
- up_read(&adev->reset_sem);
-
pm_runtime_mark_last_busy(ddev->dev);
pm_runtime_put_autosuspend(ddev->dev);
@@ -316,7 +307,7 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
enum amd_dpm_forced_level level = 0xff;
int ret;
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return -EPERM;
ret = pm_runtime_get_sync(ddev->dev);
@@ -325,8 +316,6 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
return ret;
}
- down_read(&adev->reset_sem);
-
if (is_support_sw_smu(adev))
level = smu_get_performance_level(&adev->smu);
else if (adev->powerplay.pp_funcs->get_performance_level)
@@ -334,8 +323,6 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
else
level = adev->pm.dpm.forced_level;
- up_read(&adev->reset_sem);
-
pm_runtime_mark_last_busy(ddev->dev);
pm_runtime_put_autosuspend(ddev->dev);
@@ -362,7 +349,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
enum amd_dpm_forced_level current_level = 0xff;
int ret = 0;
- if (amdgpu_in_reset(adev))
+ if (adev->in_gpu_reset)
return -EPERM;
if (strncmp("low", buf, strlen("low")) == 0) {
@@ -393,8 +380,6 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
return ret;
}
- down_read(&adev->reset_sem);
-
if (is_support_sw_smu(adev))
current_level = smu_get_performance_level(&adev->smu);
else if (adev->powerplay.pp_funcs->get_performance_level)
@@ -403,8 +388,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
if (current_level == level) {
pm_runtime_mark_last_busy(ddev->dev);
pm_runtime_put_autosuspend(ddev->dev);
- ret = count;
- goto pro_end;
+ return count;
}
if (adev->asic_type == CHIP_RAVEN) {
@@ -425,8 +409,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
pr_err("Currently not in any profile mode!\n");
pm_runtime_mark_last_busy(ddev->dev);
pm_runtime_put_autosuspend(ddev->dev);
- ret = -EINVAL;
- goto pro_end;
+ return -EINVAL;
}
if (is_support_sw_smu(adev)) {
@@ -434,8 +417,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
if (ret) {
pm_runtime_mark_last_busy(ddev->dev);
pm_runtime_put_autosuspend(ddev->dev);
- ret = -EIN