summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
diff options
context:
space:
mode:
authorChris Wilson <chris@chris-wilson.co.uk>2019-04-24 18:48:39 +0100
committerChris Wilson <chris@chris-wilson.co.uk>2019-04-24 21:01:46 +0100
commit112ed2d31a46f4704085ad925435b77e62b8abee (patch)
treed099b0a6d7989a1f60810bbf82b2966439eb490c /drivers/gpu/drm/i915/gt/selftest_hangcheck.c
parent86554f48e511faa58f729cc077b1733179882804 (diff)
drm/i915: Move GraphicsTechnology files under gt/
Start partitioning off the code that talks to the hardware (GT) from the uapi layers and move the device facing code under gt/ One casualty is s/intel_ringbuffer.h/intel_engine.h/ with the plan to subdivide that header and body further (and split out the submission code from the ringbuffer and logical context handling). This patch aims to be simple motion so git can fixup inflight patches with little mess. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Jani Nikula <jani.nikula@intel.com> Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190424174839.7141-1-chris@chris-wilson.co.uk
Diffstat (limited to 'drivers/gpu/drm/i915/gt/selftest_hangcheck.c')
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_hangcheck.c1919
1 files changed, 1919 insertions, 0 deletions
diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
new file mode 100644
index 000000000000..acd33aa46068
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -0,0 +1,1919 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/kthread.h>
+
+#include "i915_selftest.h"
+#include "selftests/i915_random.h"
+#include "selftests/igt_flush_test.h"
+#include "selftests/igt_reset.h"
+#include "selftests/igt_wedge_me.h"
+
+#include "selftests/mock_context.h"
+#include "selftests/mock_drm.h"
+
+#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
+
+struct hang {
+ struct drm_i915_private *i915;
+ struct drm_i915_gem_object *hws;
+ struct drm_i915_gem_object *obj;
+ struct i915_gem_context *ctx;
+ u32 *seqno;
+ u32 *batch;
+};
+
+static int hang_init(struct hang *h, struct drm_i915_private *i915)
+{
+ void *vaddr;
+ int err;
+
+ memset(h, 0, sizeof(*h));
+ h->i915 = i915;
+
+ h->ctx = kernel_context(i915);
+ if (IS_ERR(h->ctx))
+ return PTR_ERR(h->ctx);
+
+ GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
+
+ h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
+ if (IS_ERR(h->hws)) {
+ err = PTR_ERR(h->hws);
+ goto err_ctx;
+ }
+
+ h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
+ if (IS_ERR(h->obj)) {
+ err = PTR_ERR(h->obj);
+ goto err_hws;
+ }
+
+ i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
+ vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
+ if (IS_ERR(vaddr)) {
+ err = PTR_ERR(vaddr);
+ goto err_obj;
+ }
+ h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
+
+ vaddr = i915_gem_object_pin_map(h->obj,
+ i915_coherent_map_type(i915));
+ if (IS_ERR(vaddr)) {
+ err = PTR_ERR(vaddr);
+ goto err_unpin_hws;
+ }
+ h->batch = vaddr;
+
+ return 0;
+
+err_unpin_hws:
+ i915_gem_object_unpin_map(h->hws);
+err_obj:
+ i915_gem_object_put(h->obj);
+err_hws:
+ i915_gem_object_put(h->hws);
+err_ctx:
+ kernel_context_close(h->ctx);
+ return err;
+}
+
+static u64 hws_address(const struct i915_vma *hws,
+ const struct i915_request *rq)
+{
+ return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
+}
+
+static int move_to_active(struct i915_vma *vma,
+ struct i915_request *rq,
+ unsigned int flags)
+{
+ int err;
+
+ err = i915_vma_move_to_active(vma, rq, flags);
+ if (err)
+ return err;
+
+ if (!i915_gem_object_has_active_reference(vma->obj)) {
+ i915_gem_object_get(vma->obj);
+ i915_gem_object_set_active_reference(vma->obj);
+ }
+
+ return 0;
+}
+
+static struct i915_request *
+hang_create_request(struct hang *h, struct intel_engine_cs *engine)
+{
+ struct drm_i915_private *i915 = h->i915;
+ struct i915_address_space *vm =
+ h->ctx->ppgtt ? &h->ctx->ppgtt->vm : &i915->ggtt.vm;
+ struct i915_request *rq = NULL;
+ struct i915_vma *hws, *vma;
+ unsigned int flags;
+ u32 *batch;
+ int err;
+
+ if (i915_gem_object_is_active(h->obj)) {
+ struct drm_i915_gem_object *obj;
+ void *vaddr;
+
+ obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
+ if (IS_ERR(obj))
+ return ERR_CAST(obj);
+
+ vaddr = i915_gem_object_pin_map(obj,
+ i915_coherent_map_type(h->i915));
+ if (IS_ERR(vaddr)) {
+ i915_gem_object_put(obj);
+ return ERR_CAST(vaddr);
+ }
+
+ i915_gem_object_unpin_map(h->obj);
+ i915_gem_object_put(h->obj);
+
+ h->obj = obj;
+ h->batch = vaddr;
+ }
+
+ vma = i915_vma_instance(h->obj, vm, NULL);
+ if (IS_ERR(vma))
+ return ERR_CAST(vma);
+
+ hws = i915_vma_instance(h->hws, vm, NULL);
+ if (IS_ERR(hws))
+ return ERR_CAST(hws);
+
+ err = i915_vma_pin(vma, 0, 0, PIN_USER);
+ if (err)
+ return ERR_PTR(err);
+
+ err = i915_vma_pin(hws, 0, 0, PIN_USER);
+ if (err)
+ goto unpin_vma;
+
+ rq = i915_request_alloc(engine, h->ctx);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto unpin_hws;
+ }
+
+ err = move_to_active(vma, rq, 0);
+ if (err)
+ goto cancel_rq;
+
+ err = move_to_active(hws, rq, 0);
+ if (err)
+ goto cancel_rq;
+
+ batch = h->batch;
+ if (INTEL_GEN(i915) >= 8) {
+ *batch++ = MI_STORE_DWORD_IMM_GEN4;
+ *batch++ = lower_32_bits(hws_address(hws, rq));
+ *batch++ = upper_32_bits(hws_address(hws, rq));
+ *batch++ = rq->fence.seqno;
+ *batch++ = MI_ARB_CHECK;
+
+ memset(batch, 0, 1024);
+ batch += 1024 / sizeof(*batch);
+
+ *batch++ = MI_ARB_CHECK;
+ *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
+ *batch++ = lower_32_bits(vma->node.start);
+ *batch++ = upper_32_bits(vma->node.start);
+ } else if (INTEL_GEN(i915) >= 6) {
+ *batch++ = MI_STORE_DWORD_IMM_GEN4;
+ *batch++ = 0;
+ *batch++ = lower_32_bits(hws_address(hws, rq));
+ *batch++ = rq->fence.seqno;
+ *batch++ = MI_ARB_CHECK;
+
+ memset(batch, 0, 1024);
+ batch += 1024 / sizeof(*batch);
+
+ *batch++ = MI_ARB_CHECK;
+ *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
+ *batch++ = lower_32_bits(vma->node.start);
+ } else if (INTEL_GEN(i915) >= 4) {
+ *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+ *batch++ = 0;
+ *batch++ = lower_32_bits(hws_address(hws, rq));
+ *batch++ = rq->fence.seqno;
+ *batch++ = MI_ARB_CHECK;
+
+ memset(batch, 0, 1024);
+ batch += 1024 / sizeof(*batch);
+
+ *batch++ = MI_ARB_CHECK;
+ *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
+ *batch++ = lower_32_bits(vma->node.start);
+ } else {
+ *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+ *batch++ = lower_32_bits(hws_address(hws, rq));
+ *batch++ = rq->fence.seqno;
+ *batch++ = MI_ARB_CHECK;
+
+ memset(batch, 0, 1024);
+ batch += 1024 / sizeof(*batch);
+
+ *batch++ = MI_ARB_CHECK;
+ *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
+ *batch++ = lower_32_bits(vma->node.start);
+ }
+ *batch++ = MI_BATCH_BUFFER_END; /* not reached */
+ i915_gem_chipset_flush(h->i915);
+
+ if (rq->engine->emit_init_breadcrumb) {
+ err = rq->engine->emit_init_breadcrumb(rq);
+ if (err)
+ goto cancel_rq;
+ }
+
+ flags = 0;
+ if (INTEL_GEN(vm->i915) <= 5)
+ flags |= I915_DISPATCH_SECURE;
+
+ err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
+
+cancel_rq:
+ if (err) {
+ i915_request_skip(rq, err);
+ i915_request_add(rq);
+ }
+unpin_hws:
+ i915_vma_unpin(hws);
+unpin_vma:
+ i915_vma_unpin(vma);
+ return err ? ERR_PTR(err) : rq;
+}
+
+static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
+{
+ return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
+}
+
+static void hang_fini(struct hang *h)
+{
+ *h->batch = MI_BATCH_BUFFER_END;
+ i915_gem_chipset_flush(h->i915);
+
+ i915_gem_object_unpin_map(h->obj);
+ i915_gem_object_put(h->obj);
+
+ i915_gem_object_unpin_map(h->hws);
+ i915_gem_object_put(h->hws);
+
+ kernel_context_close(h->ctx);
+
+ igt_flush_test(h->i915, I915_WAIT_LOCKED);
+}
+
+static bool wait_until_running(struct hang *h, struct i915_request *rq)
+{
+ return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
+ rq->fence.seqno),
+ 10) &&
+ wait_for(i915_seqno_passed(hws_seqno(h, rq),
+ rq->fence.seqno),
+ 1000));
+}
+
+static int igt_hang_sanitycheck(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct i915_request *rq;
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+ struct hang h;
+ int err;
+
+ /* Basic check that we can execute our hanging batch */
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = hang_init(&h, i915);
+ if (err)
+ goto unlock;
+
+ for_each_engine(engine, i915, id) {
+ struct igt_wedge_me w;
+ long timeout;
+
+ if (!intel_engine_can_store_dword(engine))
+ continue;
+
+ rq = hang_create_request(&h, engine);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ pr_err("Failed to create request for %s, err=%d\n",
+ engine->name, err);
+ goto fini;
+ }
+
+ i915_request_get(rq);
+
+ *h.batch = MI_BATCH_BUFFER_END;
+ i915_gem_chipset_flush(i915);
+
+ i915_request_add(rq);
+
+ timeout = 0;
+ igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
+ timeout = i915_request_wait(rq,
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT);
+ if (i915_reset_failed(i915))
+ timeout = -EIO;
+
+ i915_request_put(rq);
+
+ if (timeout < 0) {
+ err = timeout;
+ pr_err("Wait for request failed on %s, err=%d\n",
+ engine->name, err);
+ goto fini;
+ }
+ }
+
+fini:
+ hang_fini(&h);
+unlock:
+ mutex_unlock(&i915->drm.struct_mutex);
+ return err;
+}
+
+static int igt_global_reset(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ unsigned int reset_count;
+ int err = 0;
+
+ /* Check that we can issue a global GPU reset */
+
+ igt_global_reset_lock(i915);
+
+ reset_count = i915_reset_count(&i915->gpu_error);
+
+ i915_reset(i915, ALL_ENGINES, NULL);
+
+ if (i915_reset_count(&i915->gpu_error) == reset_count) {
+ pr_err("No GPU reset recorded!\n");
+ err = -EINVAL;
+ }
+
+ igt_global_reset_unlock(i915);
+
+ if (i915_reset_failed(i915))
+ err = -EIO;
+
+ return err;
+}
+
+static int igt_wedged_reset(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ intel_wakeref_t wakeref;
+
+ /* Check that we can recover a wedged device with a GPU reset */
+
+ igt_global_reset_lock(i915);
+ wakeref = intel_runtime_pm_get(i915);
+
+ i915_gem_set_wedged(i915);
+
+ GEM_BUG_ON(!i915_reset_failed(i915));
+ i915_reset(i915, ALL_ENGINES, NULL);
+
+ intel_runtime_pm_put(i915, wakeref);
+ igt_global_reset_unlock(i915);
+
+ return i915_reset_failed(i915) ? -EIO : 0;
+}
+
+static bool wait_for_idle(struct intel_engine_cs *engine)
+{
+ return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
+}
+
+static int igt_reset_nop(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct intel_engine_cs *engine;
+ struct i915_gem_context *ctx;
+ unsigned int reset_count, count;
+ enum intel_engine_id id;
+ intel_wakeref_t wakeref;
+ struct drm_file *file;
+ IGT_TIMEOUT(end_time);
+ int err = 0;
+
+ /* Check that we can reset during non-user portions of requests */
+
+ file = mock_file(i915);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ ctx = live_context(i915, file);
+ mutex_unlock(&i915->drm.struct_mutex);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto out;
+ }
+
+ i915_gem_context_clear_bannable(ctx);
+ wakeref = intel_runtime_pm_get(i915);
+ reset_count = i915_reset_count(&i915->gpu_error);
+ count = 0;
+ do {
+ mutex_lock(&i915->drm.struct_mutex);
+ for_each_engine(engine, i915, id) {
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_alloc(engine, ctx);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ break;
+ }
+
+ i915_request_add(rq);
+ }
+ }
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ igt_global_reset_lock(i915);
+ i915_reset(i915, ALL_ENGINES, NULL);
+ igt_global_reset_unlock(i915);
+ if (i915_reset_failed(i915)) {
+ err = -EIO;
+ break;
+ }
+
+ if (i915_reset_count(&i915->gpu_error) !=
+ reset_count + ++count) {
+ pr_err("Full GPU reset not recorded!\n");
+ err = -EINVAL;
+ break;
+ }
+
+ if (!i915_reset_flush(i915)) {
+ struct drm_printer p =
+ drm_info_printer(i915->drm.dev);
+
+ pr_err("%s failed to idle after reset\n",
+ engine->name);
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+
+ err = -EIO;
+ break;
+ }
+
+ err = igt_flush_test(i915, 0);
+ if (err)
+ break;
+ } while (time_before(jiffies, end_time));
+ pr_info("%s: %d resets\n", __func__, count);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = igt_flush_test(i915, I915_WAIT_LOCKED);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ intel_runtime_pm_put(i915, wakeref);
+
+out:
+ mock_file_free(i915, file);
+ if (i915_reset_failed(i915))
+ err = -EIO;
+ return err;
+}
+
+static int igt_reset_nop_engine(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct intel_engine_cs *engine;
+ struct i915_gem_context *ctx;
+ enum intel_engine_id id;
+ intel_wakeref_t wakeref;
+ struct drm_file *file;
+ int err = 0;
+
+ /* Check that we can engine-reset during non-user portions */
+
+ if (!intel_has_reset_engine(i915))
+ return 0;
+
+ file = mock_file(i915);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ ctx = live_context(i915, file);
+ mutex_unlock(&i915->drm.struct_mutex);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto out;
+ }
+
+ i915_gem_context_clear_bannable(ctx);
+ wakeref = intel_runtime_pm_get(i915);
+ for_each_engine(engine, i915, id) {
+ unsigned int reset_count, reset_engine_count;
+ unsigned int count;
+ IGT_TIMEOUT(end_time);
+
+ reset_count = i915_reset_count(&i915->gpu_error);
+ reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
+ engine);
+ count = 0;
+
+ set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+ do {
+ int i;
+
+ if (!wait_for_idle(engine)) {
+ pr_err("%s failed to idle before reset\n",
+ engine->name);
+ err = -EIO;
+ break;
+ }
+
+ mutex_lock(&i915->drm.struct_mutex);
+ for (i = 0; i < 16; i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_alloc(engine, ctx);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ break;
+ }
+
+ i915_request_add(rq);
+ }
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ err = i915_reset_engine(engine, NULL);
+ if (err) {
+ pr_err("i915_reset_engine failed\n");
+ break;
+ }
+
+ if (i915_reset_count(&i915->gpu_error) != reset_count) {
+ pr_err("Full GPU reset recorded! (engine reset expected)\n");
+ err = -EINVAL;
+ break;
+ }
+
+ if (i915_reset_engine_count(&i915->gpu_error, engine) !=
+ reset_engine_count + ++count) {
+ pr_err("%s engine reset not recorded!\n",
+ engine->name);
+ err = -EINVAL;
+ break;
+ }
+
+ if (!i915_reset_flush(i915)) {
+ struct drm_printer p =
+ drm_info_printer(i915->drm.dev);
+
+ pr_err("%s failed to idle after reset\n",
+ engine->name);
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+
+ err = -EIO;
+ break;
+ }
+ } while (time_before(jiffies, end_time));
+ clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+ pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
+
+ if (err)
+ break;
+
+ err = igt_flush_test(i915, 0);
+ if (err)
+ break;
+ }
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = igt_flush_test(i915, I915_WAIT_LOCKED);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ intel_runtime_pm_put(i915, wakeref);
+out:
+ mock_file_free(i915, file);
+ if (i915_reset_failed(i915))
+ err = -EIO;
+ return err;
+}
+
+static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
+{
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+ struct hang h;
+ int err = 0;
+
+ /* Check that we can issue an engine reset on an idle engine (no-op) */
+
+ if (!intel_has_reset_engine(i915))
+ return 0;
+
+ if (active) {
+ mutex_lock(&i915->drm.struct_mutex);
+ err = hang_init(&h, i915);
+ mutex_unlock(&i915->drm.struct_mutex);
+ if (err)
+ return err;
+ }
+
+ for_each_engine(engine, i915, id) {
+ unsigned int reset_count, reset_engine_count;
+ IGT_TIMEOUT(end_time);
+
+ if (active && !intel_engine_can_store_dword(engine))
+ continue;
+
+ if (!wait_for_idle(engine)) {
+ pr_err("%s failed to idle before reset\n",
+ engine->name);
+ err = -EIO;
+ break;
+ }
+
+ reset_count = i915_reset_count(&i915->gpu_error);
+ reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
+ engine);
+
+ set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+ do {
+ if (active) {
+ struct i915_request *rq;
+
+ mutex_lock(&i915->drm.struct_mutex);
+ rq = hang_create_request(&h, engine);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ mutex_unlock(&i915->drm.struct_mutex);
+ break;
+ }
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ if (!wait_until_running(&h, rq)) {
+ struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+ pr_err("%s: Failed to start request %llx, at %x\n",
+ __func__, rq->fence.seqno, hws_seqno(&h, rq));
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+
+ i915_request_put(rq);
+ err = -EIO;
+ break;
+ }
+
+ i915_request_put(rq);
+ }
+
+ err = i915_reset_engine(engine, NULL);
+ if (err) {
+ pr_err("i915_reset_engine failed\n");
+ break;
+ }
+
+ if (i915_reset_count(&i915->gpu_error) != reset_count) {
+ pr_err("Full GPU reset recorded! (engine reset expected)\n");
+ err = -EINVAL;
+ break;
+ }
+
+ if (i915_reset_engine_count(&i915->gpu_error, engine) !=
+ ++reset_engine_count) {
+ pr_err("%s engine reset not recorded!\n",
+ engine->name);
+ err = -EINVAL;
+ break;
+ }
+
+ if (!i915_reset_flush(i915)) {
+ struct drm_printer p =
+ drm_info_printer(i915->drm.dev);
+
+ pr_err("%s failed to idle after reset\n",
+ engine->name);
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+
+ err = -EIO;
+ break;
+ }
+ } while (time_before(jiffies, end_time));
+ clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+
+ if (err)
+ break;
+
+ err = igt_flush_test(i915, 0);
+ if (err)
+ break;
+ }
+
+ if (i915_reset_failed(i915))
+ err = -EIO;
+
+ if (active) {
+ mutex_lock(&i915->drm.struct_mutex);
+ hang_fini(&h);
+ mutex_unlock(&i915->drm.struct_mutex);
+ }
+
+ return err;
+}
+
+static int igt_reset_idle_engine(void *arg)
+{
+ return __igt_reset_engine(arg, false);
+}
+
+static int igt_reset_active_engine(void *arg)
+{
+ return __igt_reset_engine(arg, true);
+}
+
+struct active_engine {
+ struct task_struct *task;
+ struct intel_engine_cs *engine;
+ unsigned long resets;
+ unsigned int flags;
+};
+
+#define TEST_ACTIVE BIT(0)
+#define TEST_OTHERS BIT(1)
+#define TEST_SELF BIT(2)
+#define TEST_PRIORITY BIT(3)
+
+static int active_request_put(struct i915_request *rq)
+{
+ int err = 0;
+
+ if (!rq)
+ return 0;
+
+ if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
+ GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
+ rq->engine->name,
+ rq->fence.context,
+ rq->fence.seqno);
+ GEM_TRACE_DUMP();
+
+ i915_gem_set_wedged(rq->i915);
+ err = -EIO;
+ }
+
+ i915_request_put(rq);
+
+ return err;
+}
+
+static int active_engine(void *data)
+{
+ I915_RND_STATE(prng);
+ struct active_engine *arg = data;
+ struct intel_engine_cs *engine = arg->engine;
+ struct i915_request *rq[8] = {};
+ struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
+ struct drm_file *file;
+ unsigned long count = 0;
+ int err = 0;
+
+ file = mock_file(engine->i915);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ for (count = 0; count < ARRAY_SIZE(ctx); count++) {
+ mutex_lock(&engine->i915->drm.struct_mutex);
+ ctx[count] = live_context(engine->i915, file);
+ mutex_unlock(&engine->i915->drm.struct_mutex);
+ if (IS_ERR(ctx[count])) {
+ err = PTR_ERR(ctx[count]);
+ while (--count)
+ i915_gem_context_put(ctx[count]);
+ goto err_file;
+ }
+ }
+
+ while (!kthread_should_stop()) {
+ unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
+ struct i915_request *old = rq[idx];
+ struct i915_request *new;
+
+ mutex_lock(&engine->i915->drm.struct_mutex);
+ new = i915_request_alloc(engine, ctx[idx]);
+ if (IS_ERR(new)) {
+ mutex_unlock(&engine->i915->drm.struct_mutex);
+ err = PTR_ERR(new);
+ break;
+ }
+
+ if (arg->flags & TEST_PRIORITY)
+ ctx[idx]->sched.priority =
+ i915_prandom_u32_max_state(512, &prng);
+
+ rq[idx] = i915_request_get(new);
+ i915_request_add(new);
+ mutex_unlock(&engine->i915->drm.struct_mutex);
+
+ err = active_request_put(old);
+ if (err)
+ break;
+
+ cond_resched();
+ }
+
+ for (count = 0; count < ARRAY_SIZE(rq); count++) {
+ int err__ = active_request_put(rq[count]);
+
+ /* Keep the first error */
+ if (!err)
+ err = err__;
+ }
+
+err_file:
+ mock_file_free(engine->i915, file);
+ return err;
+}
+
+static int __igt_reset_engines(struct drm_i915_private *i915,
+ const char *test_name,
+ unsigned int flags)
+{
+ struct intel_engine_cs *engine, *other;
+ enum intel_engine_id id, tmp;
+ struct hang h;
+ int err = 0;
+
+ /* Check that issuing a reset on one engine does not interfere
+ * with any other engine.
+ */
+
+ if (!intel_has_reset_engine(i915))
+ return 0;
+
+ if (flags & TEST_ACTIVE) {
+ mutex_lock(&i915->drm.struct_mutex);
+ err = hang_init(&h, i915);
+ mutex_unlock(&i915->drm.struct_mutex);
+ if (err)
+ return err;
+
+ if (flags & TEST_PRIORITY)
+ h.ctx->sched.priority = 1024;
+ }
+
+ for_each_engine(engine, i915, id) {
+ struct active_engine threads[I915_NUM_ENGINES] = {};
+ unsigned long global = i915_reset_count(&i915->gpu_error);
+ unsigned long count = 0, reported;
+ IGT_TIMEOUT(end_time);
+
+ if (flags & TEST_ACTIVE &&
+ !intel_engine_can_store_dword(engine))
+ continue;
+
+ if (!wait_for_idle(engine)) {
+ pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
+ engine->name, test_name);
+ err = -EIO;
+ break;
+ }
+
+ memset(threads, 0, sizeof(threads));
+ for_each_engine(other, i915, tmp) {
+ struct task_struct *tsk;
+
+ threads[tmp].resets =
+ i915_reset_engine_count(&i915->gpu_error,
+ other);
+
+ if (!(flags & TEST_OTHERS))
+ continue;
+
+ if (other == engine && !(flags & TEST_SELF))
+ continue;
+
+ threads[tmp].engine = other;
+ threads[tmp].flags = flags;
+
+ tsk = kthread_run(active_engine, &threads[tmp],
+ "igt/%s", other->name);
+ if (IS_ERR(tsk)) {
+ err = PTR_ERR(tsk);
+ goto unwind;
+ }
+
+ threads[tmp].task = tsk;
+ get_task_struct(tsk);
+ }
+
+ set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+ do {
+ struct i915_request *rq = NULL;
+
+ if (flags & TEST_ACTIVE) {
+ mutex_lock(&i915->drm.struct_mutex);
+ rq = hang_create_request(&h, engine);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ mutex_unlock(&i915->drm.struct_mutex);
+ break;
+ }
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ if (!wait_until_running(&h, rq)) {
+ struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+ pr_err("%s: Failed to start request %llx, at %x\n",
+ __func__, rq->fence.seqno, hws_seqno(&h, rq));
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+
+ i915_request_put(rq);
+ err = -EIO;
+ break;
+ }
+ }
+
+ err = i915_reset_engine(engine, NULL);
+ if (err) {
+ pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
+ engine->name, test_name, err);
+ break;
+ }
+
+ count++;
+
+ if (rq) {
+ if (i915_request_wait(rq, 0, HZ / 5) < 0) {
+ struct drm_printer p =
+ drm_info_printer(i915->drm.dev);
+
+ pr_err("i915_reset_engine(%s:%s):"
+ " failed to complete request after reset\n",
+ engine->name, test_name);
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+ i915_request_put(rq);
+
+ GEM_TRACE_DUMP();
+ i915_gem_set_wedged(i915);
+ err = -EIO;
+ break;
+ }
+
+ i915_request_put(rq);
+ }
+
+ if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
+ struct drm_printer p =
+ drm_info_printer(i915->drm.dev);
+
+ pr_err("i915_reset_engine(%s:%s):"
+ " failed to idle after reset\n",
+ engine->name, test_name);
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+
+ err = -EIO;
+ break;
+ }
+ } while (time_before(jiffies, end_time));
+ clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+ pr_info("i915_reset_engine(%s:%s): %lu resets\n",
+ engine->name, test_name, count);
+
+ reported = i915_reset_engine_count(&i915->gpu_error, engine);
+ reported -= threads[engine->id].resets;
+ if (reported != count) {
+ pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
+ engine->name, test_name, count, reported);
+ if (!err)
+ err = -EINVAL;
+ }
+
+unwind:
+ for_each_engine(other, i915, tmp) {
+ int ret;
+
+ if (!threads[tmp].task)
+ continue;
+
+ ret = kthread_stop(threads[tmp].task);
+ if (ret) {
+ pr_err("kthread for other engine %s failed, err=%d\n",
+ other->name, ret);
+ if (!err)
+ err = ret;
+ }
+ put_task_struct(threads[tmp].task);
+
+ if (other != engine &&
+ threads[tmp].resets !=
+ i915_reset_engine_count(&i915->gpu_error, other)) {
+ pr_err("Innocent engine %s was reset (count=%ld)\n",
+ other->name,
+ i915_reset_engine_count(&i915->gpu_error,
+ other) -
+ threads[tmp].resets);
+ if (!err)
+ err = -EINVAL;
+ }
+ }
+
+ if (global != i915_reset_count(&i915->gpu_error)) {
+ pr_err("Global reset (count=%ld)!\n",
+ i915_reset_count(&i915->gpu_error) - global);
+ if (!err)
+ err = -EINVAL;
+ }
+
+ if (err)
+ break;
+
+ err = igt_flush_test(i915, 0);
+ if (err)
+ break;
+ }
+
+ if (i915_reset_failed(i915))
+ err = -EIO;
+
+ if (flags & TEST_ACTIVE) {
+ mutex_lock(&i915->drm.struct_mutex);
+ hang_fini(&h);
+ mutex_unlock(&i915->drm.struct_mutex);
+ }
+
+ return err;
+}
+
+static int igt_reset_engines(void *arg)
+{
+ static const struct {
+ const char *name;
+ unsigned int flags;
+ } phases[] = {
+ { "idle", 0 },
+ { "active", TEST_ACTIVE },
+ { "others-idle", TEST_OTHERS },
+ { "others-active", TEST_OTHERS | TEST_ACTIVE },
+ {
+ "others-priority",
+ TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
+ },
+ {
+ "self-priority",
+ TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
+ },
+ { }
+ };
+ struct drm_i915_private *i915 = arg;
+ typeof(*phases) *p;
+ int err;
+
+ for (p = phases; p->name; p++) {
+ if (p->flags & TEST_PRIORITY) {
+ if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
+ continue;
+ }
+
+ err = __igt_reset_engines(arg, p->name, p->flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static u32 fake_hangcheck(struct drm_i915_private *i915,
+ intel_engine_mask_t mask)
+{
+ u32 count = i915_reset_count(&i915->gpu_error);
+
+ i915_reset(i915, mask, NULL);
+
+ return count;
+}
+
+static int igt_reset_wait(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct i915_request *rq;
+ unsigned int reset_count;
+ struct hang h;
+ long timeout;
+ int err;
+
+ if (!intel_engine_can_store_dword(i915->engine[RCS0]))
+ return 0;
+
+ /* Check that we detect a stuck waiter and issue a reset */
+
+ igt_global_reset_lock(i915);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = hang_init(&h, i915);
+ if (err)
+ goto unlock;
+
+ rq = hang_create_request(&h, i915->engine[RCS0]);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto fini;
+ }
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+
+ if (!wait_until_running(&h, rq)) {
+ struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+ pr_err("%s: Failed to start request %llx, at %x\n",
+ __func__, rq->fence.seqno, hws_seqno(&h, rq));
+ intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
+
+ i915_gem_set_wedged(i915);
+
+ err = -EIO;
+ goto out_rq;
+ }
+
+ reset_count = fake_hangcheck(i915, ALL_ENGINES);
+
+ timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
+ if (timeout < 0) {
+ pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
+ timeout);
+ err = timeout;
+ goto out_rq;
+ }
+
+ if (i915_reset_count(&i915->gpu_error) == reset_count) {
+ pr_err("No GPU reset recorded!\n");
+ err = -EINVAL;
+ goto out_rq;
+ }
+
+out_rq:
+ i915_request_put(rq);
+fini:
+ hang_fini(&h);
+unlock:
+ mutex_unlock(&i915->drm.struct_mutex);
+ igt_global_reset_unlock(i915);
+
+ if (i915_reset_failed(i915))
+ return -EIO;
+
+ return err;
+}
+
+struct evict_vma {
+ struct completion completion;
+ struct i915_vma *vma;
+};
+
+static int evict_vma(void *data)
+{
+ struct evict_vma *arg = data;
+ struct i915_address_space *vm = arg->vma->vm;
+ struct drm_i915_private *i915 = vm->i915;
+ struct drm_mm_node evict = arg->vma->node;
+ int err;
+
+ complete(&arg->completion);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = i915_gem_evict_for_node(vm