blob: 71ad18b7cff6e3eda05b50d586dd976dc57edef5 [file] [log] [blame]
/*
 *  libata-eh.c - libata error handling
 *
 *  Maintained by:  Jeff Garzik <jgarzik@pobox.com>
 *		    Please ALWAYS copy linux-ide@vger.kernel.org
 *		    on emails.
 *
 *  Copyright 2006 Tejun Heo <htejun@gmail.com>
 *
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
 *  USA.
 *
 *
 *  libata documentation is available via 'make {ps|pdf}docs',
 *  as Documentation/DocBook/libata.*
 *
 *  Hardware documentation available from http://www.t13.org/ and
 *  http://www.sata-io.org/
 *
 */
34
35#include <linux/config.h>
36#include <linux/kernel.h>
37#include <scsi/scsi.h>
38#include <scsi/scsi_host.h>
39#include <scsi/scsi_eh.h>
40#include <scsi/scsi_device.h>
41#include <scsi/scsi_cmnd.h>
42
43#include <linux/libata.h>
44
45#include "libata.h"
46
/* Forward declaration: ata_scsi_error() calls this before its definition further down. */
Tejun Heoad9e2762006-05-15 20:58:12 +090047static void __ata_port_freeze(struct ata_port *ap);
48
Tejun Heo0c247c52006-05-15 20:58:19 +090049static void ata_ering_record(struct ata_ering *ering, int is_io,
50 unsigned int err_mask)
51{
52 struct ata_ering_entry *ent;
53
54 WARN_ON(!err_mask);
55
56 ering->cursor++;
57 ering->cursor %= ATA_ERING_SIZE;
58
59 ent = &ering->ring[ering->cursor];
60 ent->is_io = is_io;
61 ent->err_mask = err_mask;
62 ent->timestamp = get_jiffies_64();
63}
64
65static struct ata_ering_entry * ata_ering_top(struct ata_ering *ering)
66{
67 struct ata_ering_entry *ent = &ering->ring[ering->cursor];
68 if (!ent->err_mask)
69 return NULL;
70 return ent;
71}
72
73static int ata_ering_map(struct ata_ering *ering,
74 int (*map_fn)(struct ata_ering_entry *, void *),
75 void *arg)
76{
77 int idx, rc = 0;
78 struct ata_ering_entry *ent;
79
80 idx = ering->cursor;
81 do {
82 ent = &ering->ring[idx];
83 if (!ent->err_mask)
84 break;
85 rc = map_fn(ent, arg);
86 if (rc)
87 break;
88 idx = (idx - 1 + ATA_ERING_SIZE) % ATA_ERING_SIZE;
89 } while (idx != ering->cursor);
90
91 return rc;
92}
93
Tejun Heoece1d632006-04-02 18:51:53 +090094/**
95 * ata_scsi_timed_out - SCSI layer time out callback
96 * @cmd: timed out SCSI command
97 *
98 * Handles SCSI layer timeout. We race with normal completion of
99 * the qc for @cmd. If the qc is already gone, we lose and let
100 * the scsi command finish (EH_HANDLED). Otherwise, the qc has
101 * timed out and EH should be invoked. Prevent ata_qc_complete()
102 * from finishing it by setting EH_SCHEDULED and return
103 * EH_NOT_HANDLED.
104 *
Tejun Heoad9e2762006-05-15 20:58:12 +0900105 * TODO: kill this function once old EH is gone.
106 *
Tejun Heoece1d632006-04-02 18:51:53 +0900107 * LOCKING:
108 * Called from timer context
109 *
110 * RETURNS:
111 * EH_HANDLED or EH_NOT_HANDLED
112 */
113enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
114{
115 struct Scsi_Host *host = cmd->device->host;
Jeff Garzik35bb94b2006-04-11 13:12:34 -0400116 struct ata_port *ap = ata_shost_to_port(host);
Tejun Heoece1d632006-04-02 18:51:53 +0900117 unsigned long flags;
118 struct ata_queued_cmd *qc;
Tejun Heoad9e2762006-05-15 20:58:12 +0900119 enum scsi_eh_timer_return ret;
Tejun Heoece1d632006-04-02 18:51:53 +0900120
121 DPRINTK("ENTER\n");
122
Tejun Heoad9e2762006-05-15 20:58:12 +0900123 if (ap->ops->error_handler) {
124 ret = EH_NOT_HANDLED;
125 goto out;
126 }
127
128 ret = EH_HANDLED;
Tejun Heoece1d632006-04-02 18:51:53 +0900129 spin_lock_irqsave(&ap->host_set->lock, flags);
130 qc = ata_qc_from_tag(ap, ap->active_tag);
131 if (qc) {
132 WARN_ON(qc->scsicmd != cmd);
133 qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
134 qc->err_mask |= AC_ERR_TIMEOUT;
135 ret = EH_NOT_HANDLED;
136 }
137 spin_unlock_irqrestore(&ap->host_set->lock, flags);
138
Tejun Heoad9e2762006-05-15 20:58:12 +0900139 out:
Tejun Heoece1d632006-04-02 18:51:53 +0900140 DPRINTK("EXIT, ret=%d\n", ret);
141 return ret;
142}
143
144/**
145 * ata_scsi_error - SCSI layer error handler callback
146 * @host: SCSI host on which error occurred
147 *
148 * Handles SCSI-layer-thrown error events.
149 *
150 * LOCKING:
151 * Inherited from SCSI layer (none, can sleep)
152 *
153 * RETURNS:
154 * Zero.
155 */
Jeff Garzik381544b2006-04-11 13:04:39 -0400156void ata_scsi_error(struct Scsi_Host *host)
Tejun Heoece1d632006-04-02 18:51:53 +0900157{
Jeff Garzik35bb94b2006-04-11 13:12:34 -0400158 struct ata_port *ap = ata_shost_to_port(host);
Tejun Heoad9e2762006-05-15 20:58:12 +0900159 spinlock_t *hs_lock = &ap->host_set->lock;
160 int i, repeat_cnt = ATA_EH_MAX_REPEAT;
161 unsigned long flags;
Tejun Heoece1d632006-04-02 18:51:53 +0900162
163 DPRINTK("ENTER\n");
164
Tejun Heoad9e2762006-05-15 20:58:12 +0900165 /* synchronize with port task */
Tejun Heoece1d632006-04-02 18:51:53 +0900166 ata_port_flush_task(ap);
167
Tejun Heoad9e2762006-05-15 20:58:12 +0900168 /* synchronize with host_set lock and sort out timeouts */
Tejun Heoece1d632006-04-02 18:51:53 +0900169
Tejun Heoad9e2762006-05-15 20:58:12 +0900170 /* For new EH, all qcs are finished in one of three ways -
171 * normal completion, error completion, and SCSI timeout.
172 * Both cmpletions can race against SCSI timeout. When normal
173 * completion wins, the qc never reaches EH. When error
174 * completion wins, the qc has ATA_QCFLAG_FAILED set.
175 *
176 * When SCSI timeout wins, things are a bit more complex.
177 * Normal or error completion can occur after the timeout but
178 * before this point. In such cases, both types of
179 * completions are honored. A scmd is determined to have
180 * timed out iff its associated qc is active and not failed.
181 */
182 if (ap->ops->error_handler) {
183 struct scsi_cmnd *scmd, *tmp;
184 int nr_timedout = 0;
Tejun Heoece1d632006-04-02 18:51:53 +0900185
Tejun Heoad9e2762006-05-15 20:58:12 +0900186 spin_lock_irqsave(hs_lock, flags);
187
188 list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) {
189 struct ata_queued_cmd *qc;
190
191 for (i = 0; i < ATA_MAX_QUEUE; i++) {
192 qc = __ata_qc_from_tag(ap, i);
193 if (qc->flags & ATA_QCFLAG_ACTIVE &&
194 qc->scsicmd == scmd)
195 break;
196 }
197
198 if (i < ATA_MAX_QUEUE) {
199 /* the scmd has an associated qc */
200 if (!(qc->flags & ATA_QCFLAG_FAILED)) {
201 /* which hasn't failed yet, timeout */
202 qc->err_mask |= AC_ERR_TIMEOUT;
203 qc->flags |= ATA_QCFLAG_FAILED;
204 nr_timedout++;
205 }
206 } else {
207 /* Normal completion occurred after
208 * SCSI timeout but before this point.
209 * Successfully complete it.
210 */
211 scmd->retries = scmd->allowed;
212 scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
213 }
214 }
215
216 /* If we have timed out qcs. They belong to EH from
217 * this point but the state of the controller is
218 * unknown. Freeze the port to make sure the IRQ
219 * handler doesn't diddle with those qcs. This must
220 * be done atomically w.r.t. setting QCFLAG_FAILED.
221 */
222 if (nr_timedout)
223 __ata_port_freeze(ap);
224
225 spin_unlock_irqrestore(hs_lock, flags);
226 } else
227 spin_unlock_wait(hs_lock);
228
229 repeat:
230 /* invoke error handler */
231 if (ap->ops->error_handler) {
232 /* clear EH pending */
233 spin_lock_irqsave(hs_lock, flags);
234 ap->flags &= ~ATA_FLAG_EH_PENDING;
235 spin_unlock_irqrestore(hs_lock, flags);
236
237 /* invoke EH */
238 ap->ops->error_handler(ap);
239
240 /* Exception might have happend after ->error_handler
241 * recovered the port but before this point. Repeat
242 * EH in such case.
243 */
244 spin_lock_irqsave(hs_lock, flags);
245
246 if (ap->flags & ATA_FLAG_EH_PENDING) {
247 if (--repeat_cnt) {
248 ata_port_printk(ap, KERN_INFO,
249 "EH pending after completion, "
250 "repeating EH (cnt=%d)\n", repeat_cnt);
251 spin_unlock_irqrestore(hs_lock, flags);
252 goto repeat;
253 }
254 ata_port_printk(ap, KERN_ERR, "EH pending after %d "
255 "tries, giving up\n", ATA_EH_MAX_REPEAT);
256 }
257
258 /* Clear host_eh_scheduled while holding hs_lock such
259 * that if exception occurs after this point but
260 * before EH completion, SCSI midlayer will
261 * re-initiate EH.
262 */
263 host->host_eh_scheduled = 0;
264
265 spin_unlock_irqrestore(hs_lock, flags);
266 } else {
267 WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
268 ap->ops->eng_timeout(ap);
269 }
270
271 /* finish or retry handled scmd's and clean up */
Tejun Heoece1d632006-04-02 18:51:53 +0900272 WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));
273
274 scsi_eh_flush_done_q(&ap->eh_done_q);
275
Tejun Heoad9e2762006-05-15 20:58:12 +0900276 /* clean up */
277 spin_lock_irqsave(hs_lock, flags);
278
279 if (ap->flags & ATA_FLAG_RECOVERED)
280 ata_port_printk(ap, KERN_INFO, "EH complete\n");
281 ap->flags &= ~ATA_FLAG_RECOVERED;
282
283 spin_unlock_irqrestore(hs_lock, flags);
284
Tejun Heoece1d632006-04-02 18:51:53 +0900285 DPRINTK("EXIT\n");
Tejun Heoece1d632006-04-02 18:51:53 +0900286}
287
288/**
289 * ata_qc_timeout - Handle timeout of queued command
290 * @qc: Command that timed out
291 *
292 * Some part of the kernel (currently, only the SCSI layer)
293 * has noticed that the active command on port @ap has not
294 * completed after a specified length of time. Handle this
295 * condition by disabling DMA (if necessary) and completing
296 * transactions, with error if necessary.
297 *
298 * This also handles the case of the "lost interrupt", where
299 * for some reason (possibly hardware bug, possibly driver bug)
300 * an interrupt was not delivered to the driver, even though the
301 * transaction completed successfully.
302 *
Tejun Heoad9e2762006-05-15 20:58:12 +0900303 * TODO: kill this function once old EH is gone.
304 *
Tejun Heoece1d632006-04-02 18:51:53 +0900305 * LOCKING:
306 * Inherited from SCSI layer (none, can sleep)
307 */
308static void ata_qc_timeout(struct ata_queued_cmd *qc)
309{
310 struct ata_port *ap = qc->ap;
311 struct ata_host_set *host_set = ap->host_set;
312 u8 host_stat = 0, drv_stat;
313 unsigned long flags;
314
315 DPRINTK("ENTER\n");
316
317 ap->hsm_task_state = HSM_ST_IDLE;
318
319 spin_lock_irqsave(&host_set->lock, flags);
320
321 switch (qc->tf.protocol) {
322
323 case ATA_PROT_DMA:
324 case ATA_PROT_ATAPI_DMA:
325 host_stat = ap->ops->bmdma_status(ap);
326
327 /* before we do anything else, clear DMA-Start bit */
328 ap->ops->bmdma_stop(qc);
329
330 /* fall through */
331
332 default:
333 ata_altstatus(ap);
334 drv_stat = ata_chk_status(ap);
335
336 /* ack bmdma irq events */
337 ap->ops->irq_clear(ap);
338
Tejun Heof15a1da2006-05-15 20:57:56 +0900339 ata_dev_printk(qc->dev, KERN_ERR, "command 0x%x timeout, "
340 "stat 0x%x host_stat 0x%x\n",
341 qc->tf.command, drv_stat, host_stat);
Tejun Heoece1d632006-04-02 18:51:53 +0900342
343 /* complete taskfile transaction */
344 qc->err_mask |= ac_err_mask(drv_stat);
345 break;
346 }
347
348 spin_unlock_irqrestore(&host_set->lock, flags);
349
350 ata_eh_qc_complete(qc);
351
352 DPRINTK("EXIT\n");
353}
354
355/**
356 * ata_eng_timeout - Handle timeout of queued command
357 * @ap: Port on which timed-out command is active
358 *
359 * Some part of the kernel (currently, only the SCSI layer)
360 * has noticed that the active command on port @ap has not
361 * completed after a specified length of time. Handle this
362 * condition by disabling DMA (if necessary) and completing
363 * transactions, with error if necessary.
364 *
365 * This also handles the case of the "lost interrupt", where
366 * for some reason (possibly hardware bug, possibly driver bug)
367 * an interrupt was not delivered to the driver, even though the
368 * transaction completed successfully.
369 *
Tejun Heoad9e2762006-05-15 20:58:12 +0900370 * TODO: kill this function once old EH is gone.
371 *
Tejun Heoece1d632006-04-02 18:51:53 +0900372 * LOCKING:
373 * Inherited from SCSI layer (none, can sleep)
374 */
375void ata_eng_timeout(struct ata_port *ap)
376{
377 DPRINTK("ENTER\n");
378
379 ata_qc_timeout(ata_qc_from_tag(ap, ap->active_tag));
380
381 DPRINTK("EXIT\n");
382}
383
Tejun Heof686bcb2006-05-15 20:58:05 +0900384/**
385 * ata_qc_schedule_eh - schedule qc for error handling
386 * @qc: command to schedule error handling for
387 *
388 * Schedule error handling for @qc. EH will kick in as soon as
389 * other commands are drained.
390 *
391 * LOCKING:
392 * spin_lock_irqsave(host_set lock)
393 */
394void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
395{
396 struct ata_port *ap = qc->ap;
397
398 WARN_ON(!ap->ops->error_handler);
399
400 qc->flags |= ATA_QCFLAG_FAILED;
401 qc->ap->flags |= ATA_FLAG_EH_PENDING;
402
403 /* The following will fail if timeout has already expired.
404 * ata_scsi_error() takes care of such scmds on EH entry.
405 * Note that ATA_QCFLAG_FAILED is unconditionally set after
406 * this function completes.
407 */
408 scsi_req_abort_cmd(qc->scsicmd);
409}
410
Tejun Heo7b70fc02006-05-15 20:58:07 +0900411/**
412 * ata_port_schedule_eh - schedule error handling without a qc
413 * @ap: ATA port to schedule EH for
414 *
415 * Schedule error handling for @ap. EH will kick in as soon as
416 * all commands are drained.
417 *
418 * LOCKING:
419 * spin_lock_irqsave(host_set lock)
420 */
421void ata_port_schedule_eh(struct ata_port *ap)
422{
423 WARN_ON(!ap->ops->error_handler);
424
425 ap->flags |= ATA_FLAG_EH_PENDING;
426 ata_schedule_scsi_eh(ap->host);
427
428 DPRINTK("port EH scheduled\n");
429}
430
431/**
432 * ata_port_abort - abort all qc's on the port
433 * @ap: ATA port to abort qc's for
434 *
435 * Abort all active qc's of @ap and schedule EH.
436 *
437 * LOCKING:
438 * spin_lock_irqsave(host_set lock)
439 *
440 * RETURNS:
441 * Number of aborted qc's.
442 */
443int ata_port_abort(struct ata_port *ap)
444{
445 int tag, nr_aborted = 0;
446
447 WARN_ON(!ap->ops->error_handler);
448
449 for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
450 struct ata_queued_cmd *qc = ata_qc_from_tag(ap, tag);
451
452 if (qc) {
453 qc->flags |= ATA_QCFLAG_FAILED;
454 ata_qc_complete(qc);
455 nr_aborted++;
456 }
457 }
458
459 if (!nr_aborted)
460 ata_port_schedule_eh(ap);
461
462 return nr_aborted;
463}
464
Tejun Heoe3180492006-05-15 20:58:09 +0900465/**
466 * __ata_port_freeze - freeze port
467 * @ap: ATA port to freeze
468 *
469 * This function is called when HSM violation or some other
470 * condition disrupts normal operation of the port. Frozen port
471 * is not allowed to perform any operation until the port is
472 * thawed, which usually follows a successful reset.
473 *
474 * ap->ops->freeze() callback can be used for freezing the port
475 * hardware-wise (e.g. mask interrupt and stop DMA engine). If a
476 * port cannot be frozen hardware-wise, the interrupt handler
477 * must ack and clear interrupts unconditionally while the port
478 * is frozen.
479 *
480 * LOCKING:
481 * spin_lock_irqsave(host_set lock)
482 */
483static void __ata_port_freeze(struct ata_port *ap)
484{
485 WARN_ON(!ap->ops->error_handler);
486
487 if (ap->ops->freeze)
488 ap->ops->freeze(ap);
489
490 ap->flags |= ATA_FLAG_FROZEN;
491
492 DPRINTK("ata%u port frozen\n", ap->id);
493}
494
495/**
496 * ata_port_freeze - abort & freeze port
497 * @ap: ATA port to freeze
498 *
499 * Abort and freeze @ap.
500 *
501 * LOCKING:
502 * spin_lock_irqsave(host_set lock)
503 *
504 * RETURNS:
505 * Number of aborted commands.
506 */
507int ata_port_freeze(struct ata_port *ap)
508{
509 int nr_aborted;
510
511 WARN_ON(!ap->ops->error_handler);
512
513 nr_aborted = ata_port_abort(ap);
514 __ata_port_freeze(ap);
515
516 return nr_aborted;
517}
518
519/**
520 * ata_eh_freeze_port - EH helper to freeze port
521 * @ap: ATA port to freeze
522 *
523 * Freeze @ap.
524 *
525 * LOCKING:
526 * None.
527 */
528void ata_eh_freeze_port(struct ata_port *ap)
529{
530 unsigned long flags;
531
532 if (!ap->ops->error_handler)
533 return;
534
535 spin_lock_irqsave(&ap->host_set->lock, flags);
536 __ata_port_freeze(ap);
537 spin_unlock_irqrestore(&ap->host_set->lock, flags);
538}
539
540/**
541 * ata_port_thaw_port - EH helper to thaw port
542 * @ap: ATA port to thaw
543 *
544 * Thaw frozen port @ap.
545 *
546 * LOCKING:
547 * None.
548 */
549void ata_eh_thaw_port(struct ata_port *ap)
550{
551 unsigned long flags;
552
553 if (!ap->ops->error_handler)
554 return;
555
556 spin_lock_irqsave(&ap->host_set->lock, flags);
557
558 ap->flags &= ~ATA_FLAG_FROZEN;
559
560 if (ap->ops->thaw)
561 ap->ops->thaw(ap);
562
563 spin_unlock_irqrestore(&ap->host_set->lock, flags);
564
565 DPRINTK("ata%u port thawed\n", ap->id);
566}
567
/* No-op completion callback; __ata_eh_qc_complete() installs it as
 * qc->scsidone before completing a qc from EH.
 */
static void ata_eh_scsidone(struct scsi_cmnd *scmd)
{
	/* intentionally empty */
}
572
573static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
574{
575 struct ata_port *ap = qc->ap;
576 struct scsi_cmnd *scmd = qc->scsicmd;
577 unsigned long flags;
578
579 spin_lock_irqsave(&ap->host_set->lock, flags);
580 qc->scsidone = ata_eh_scsidone;
581 __ata_qc_complete(qc);
582 WARN_ON(ata_tag_valid(qc->tag));
583 spin_unlock_irqrestore(&ap->host_set->lock, flags);
584
585 scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
586}
587
588/**
589 * ata_eh_qc_complete - Complete an active ATA command from EH
590 * @qc: Command to complete
591 *
592 * Indicate to the mid and upper layers that an ATA command has
593 * completed. To be used from EH.
594 */
595void ata_eh_qc_complete(struct ata_queued_cmd *qc)
596{
597 struct scsi_cmnd *scmd = qc->scsicmd;
598 scmd->retries = scmd->allowed;
599 __ata_eh_qc_complete(qc);
600}
601
602/**
603 * ata_eh_qc_retry - Tell midlayer to retry an ATA command after EH
604 * @qc: Command to retry
605 *
606 * Indicate to the mid and upper layers that an ATA command
607 * should be retried. To be used from EH.
608 *
609 * SCSI midlayer limits the number of retries to scmd->allowed.
610 * scmd->retries is decremented for commands which get retried
611 * due to unrelated failures (qc->err_mask is zero).
612 */
613void ata_eh_qc_retry(struct ata_queued_cmd *qc)
614{
615 struct scsi_cmnd *scmd = qc->scsicmd;
616 if (!qc->err_mask && scmd->retries)
617 scmd->retries--;
618 __ata_eh_qc_complete(qc);
619}