/*
 *  libata-eh.c - libata error handling
 *
 *  Maintained by:  Jeff Garzik <jgarzik@pobox.com>
 *		    Please ALWAYS copy linux-ide@vger.kernel.org
 *		    on emails.
 *
 *  Copyright 2006 Tejun Heo <htejun@gmail.com>
 *
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
 *  USA.
 *
 *
 *  libata documentation is available via 'make {ps|pdf}docs',
 *  as Documentation/DocBook/libata.*
 *
 *  Hardware documentation available from http://www.t13.org/ and
 *  http://www.sata-io.org/
 *
 */

35#include <linux/config.h>
36#include <linux/kernel.h>
37#include <scsi/scsi.h>
38#include <scsi/scsi_host.h>
39#include <scsi/scsi_eh.h>
40#include <scsi/scsi_device.h>
41#include <scsi/scsi_cmnd.h>
42
43#include <linux/libata.h>
44
45#include "libata.h"
46
Tejun Heoad9e2762006-05-15 20:58:12 +090047static void __ata_port_freeze(struct ata_port *ap);
48
Tejun Heoece1d632006-04-02 18:51:53 +090049/**
50 * ata_scsi_timed_out - SCSI layer time out callback
51 * @cmd: timed out SCSI command
52 *
53 * Handles SCSI layer timeout. We race with normal completion of
54 * the qc for @cmd. If the qc is already gone, we lose and let
55 * the scsi command finish (EH_HANDLED). Otherwise, the qc has
56 * timed out and EH should be invoked. Prevent ata_qc_complete()
57 * from finishing it by setting EH_SCHEDULED and return
58 * EH_NOT_HANDLED.
59 *
Tejun Heoad9e2762006-05-15 20:58:12 +090060 * TODO: kill this function once old EH is gone.
61 *
Tejun Heoece1d632006-04-02 18:51:53 +090062 * LOCKING:
63 * Called from timer context
64 *
65 * RETURNS:
66 * EH_HANDLED or EH_NOT_HANDLED
67 */
68enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
69{
70 struct Scsi_Host *host = cmd->device->host;
Jeff Garzik35bb94b2006-04-11 13:12:34 -040071 struct ata_port *ap = ata_shost_to_port(host);
Tejun Heoece1d632006-04-02 18:51:53 +090072 unsigned long flags;
73 struct ata_queued_cmd *qc;
Tejun Heoad9e2762006-05-15 20:58:12 +090074 enum scsi_eh_timer_return ret;
Tejun Heoece1d632006-04-02 18:51:53 +090075
76 DPRINTK("ENTER\n");
77
Tejun Heoad9e2762006-05-15 20:58:12 +090078 if (ap->ops->error_handler) {
79 ret = EH_NOT_HANDLED;
80 goto out;
81 }
82
83 ret = EH_HANDLED;
Tejun Heoece1d632006-04-02 18:51:53 +090084 spin_lock_irqsave(&ap->host_set->lock, flags);
85 qc = ata_qc_from_tag(ap, ap->active_tag);
86 if (qc) {
87 WARN_ON(qc->scsicmd != cmd);
88 qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
89 qc->err_mask |= AC_ERR_TIMEOUT;
90 ret = EH_NOT_HANDLED;
91 }
92 spin_unlock_irqrestore(&ap->host_set->lock, flags);
93
Tejun Heoad9e2762006-05-15 20:58:12 +090094 out:
Tejun Heoece1d632006-04-02 18:51:53 +090095 DPRINTK("EXIT, ret=%d\n", ret);
96 return ret;
97}
98
99/**
100 * ata_scsi_error - SCSI layer error handler callback
101 * @host: SCSI host on which error occurred
102 *
103 * Handles SCSI-layer-thrown error events.
104 *
105 * LOCKING:
106 * Inherited from SCSI layer (none, can sleep)
107 *
108 * RETURNS:
109 * Zero.
110 */
Jeff Garzik381544b2006-04-11 13:04:39 -0400111void ata_scsi_error(struct Scsi_Host *host)
Tejun Heoece1d632006-04-02 18:51:53 +0900112{
Jeff Garzik35bb94b2006-04-11 13:12:34 -0400113 struct ata_port *ap = ata_shost_to_port(host);
Tejun Heoad9e2762006-05-15 20:58:12 +0900114 spinlock_t *hs_lock = &ap->host_set->lock;
115 int i, repeat_cnt = ATA_EH_MAX_REPEAT;
116 unsigned long flags;
Tejun Heoece1d632006-04-02 18:51:53 +0900117
118 DPRINTK("ENTER\n");
119
Tejun Heoad9e2762006-05-15 20:58:12 +0900120 /* synchronize with port task */
Tejun Heoece1d632006-04-02 18:51:53 +0900121 ata_port_flush_task(ap);
122
Tejun Heoad9e2762006-05-15 20:58:12 +0900123 /* synchronize with host_set lock and sort out timeouts */
Tejun Heoece1d632006-04-02 18:51:53 +0900124
Tejun Heoad9e2762006-05-15 20:58:12 +0900125 /* For new EH, all qcs are finished in one of three ways -
126 * normal completion, error completion, and SCSI timeout.
127 * Both cmpletions can race against SCSI timeout. When normal
128 * completion wins, the qc never reaches EH. When error
129 * completion wins, the qc has ATA_QCFLAG_FAILED set.
130 *
131 * When SCSI timeout wins, things are a bit more complex.
132 * Normal or error completion can occur after the timeout but
133 * before this point. In such cases, both types of
134 * completions are honored. A scmd is determined to have
135 * timed out iff its associated qc is active and not failed.
136 */
137 if (ap->ops->error_handler) {
138 struct scsi_cmnd *scmd, *tmp;
139 int nr_timedout = 0;
Tejun Heoece1d632006-04-02 18:51:53 +0900140
Tejun Heoad9e2762006-05-15 20:58:12 +0900141 spin_lock_irqsave(hs_lock, flags);
142
143 list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) {
144 struct ata_queued_cmd *qc;
145
146 for (i = 0; i < ATA_MAX_QUEUE; i++) {
147 qc = __ata_qc_from_tag(ap, i);
148 if (qc->flags & ATA_QCFLAG_ACTIVE &&
149 qc->scsicmd == scmd)
150 break;
151 }
152
153 if (i < ATA_MAX_QUEUE) {
154 /* the scmd has an associated qc */
155 if (!(qc->flags & ATA_QCFLAG_FAILED)) {
156 /* which hasn't failed yet, timeout */
157 qc->err_mask |= AC_ERR_TIMEOUT;
158 qc->flags |= ATA_QCFLAG_FAILED;
159 nr_timedout++;
160 }
161 } else {
162 /* Normal completion occurred after
163 * SCSI timeout but before this point.
164 * Successfully complete it.
165 */
166 scmd->retries = scmd->allowed;
167 scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
168 }
169 }
170
171 /* If we have timed out qcs. They belong to EH from
172 * this point but the state of the controller is
173 * unknown. Freeze the port to make sure the IRQ
174 * handler doesn't diddle with those qcs. This must
175 * be done atomically w.r.t. setting QCFLAG_FAILED.
176 */
177 if (nr_timedout)
178 __ata_port_freeze(ap);
179
180 spin_unlock_irqrestore(hs_lock, flags);
181 } else
182 spin_unlock_wait(hs_lock);
183
184 repeat:
185 /* invoke error handler */
186 if (ap->ops->error_handler) {
187 /* clear EH pending */
188 spin_lock_irqsave(hs_lock, flags);
189 ap->flags &= ~ATA_FLAG_EH_PENDING;
190 spin_unlock_irqrestore(hs_lock, flags);
191
192 /* invoke EH */
193 ap->ops->error_handler(ap);
194
195 /* Exception might have happend after ->error_handler
196 * recovered the port but before this point. Repeat
197 * EH in such case.
198 */
199 spin_lock_irqsave(hs_lock, flags);
200
201 if (ap->flags & ATA_FLAG_EH_PENDING) {
202 if (--repeat_cnt) {
203 ata_port_printk(ap, KERN_INFO,
204 "EH pending after completion, "
205 "repeating EH (cnt=%d)\n", repeat_cnt);
206 spin_unlock_irqrestore(hs_lock, flags);
207 goto repeat;
208 }
209 ata_port_printk(ap, KERN_ERR, "EH pending after %d "
210 "tries, giving up\n", ATA_EH_MAX_REPEAT);
211 }
212
213 /* Clear host_eh_scheduled while holding hs_lock such
214 * that if exception occurs after this point but
215 * before EH completion, SCSI midlayer will
216 * re-initiate EH.
217 */
218 host->host_eh_scheduled = 0;
219
220 spin_unlock_irqrestore(hs_lock, flags);
221 } else {
222 WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
223 ap->ops->eng_timeout(ap);
224 }
225
226 /* finish or retry handled scmd's and clean up */
Tejun Heoece1d632006-04-02 18:51:53 +0900227 WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));
228
229 scsi_eh_flush_done_q(&ap->eh_done_q);
230
Tejun Heoad9e2762006-05-15 20:58:12 +0900231 /* clean up */
232 spin_lock_irqsave(hs_lock, flags);
233
234 if (ap->flags & ATA_FLAG_RECOVERED)
235 ata_port_printk(ap, KERN_INFO, "EH complete\n");
236 ap->flags &= ~ATA_FLAG_RECOVERED;
237
238 spin_unlock_irqrestore(hs_lock, flags);
239
Tejun Heoece1d632006-04-02 18:51:53 +0900240 DPRINTK("EXIT\n");
Tejun Heoece1d632006-04-02 18:51:53 +0900241}
242
243/**
244 * ata_qc_timeout - Handle timeout of queued command
245 * @qc: Command that timed out
246 *
247 * Some part of the kernel (currently, only the SCSI layer)
248 * has noticed that the active command on port @ap has not
249 * completed after a specified length of time. Handle this
250 * condition by disabling DMA (if necessary) and completing
251 * transactions, with error if necessary.
252 *
253 * This also handles the case of the "lost interrupt", where
254 * for some reason (possibly hardware bug, possibly driver bug)
255 * an interrupt was not delivered to the driver, even though the
256 * transaction completed successfully.
257 *
Tejun Heoad9e2762006-05-15 20:58:12 +0900258 * TODO: kill this function once old EH is gone.
259 *
Tejun Heoece1d632006-04-02 18:51:53 +0900260 * LOCKING:
261 * Inherited from SCSI layer (none, can sleep)
262 */
263static void ata_qc_timeout(struct ata_queued_cmd *qc)
264{
265 struct ata_port *ap = qc->ap;
266 struct ata_host_set *host_set = ap->host_set;
267 u8 host_stat = 0, drv_stat;
268 unsigned long flags;
269
270 DPRINTK("ENTER\n");
271
272 ap->hsm_task_state = HSM_ST_IDLE;
273
274 spin_lock_irqsave(&host_set->lock, flags);
275
276 switch (qc->tf.protocol) {
277
278 case ATA_PROT_DMA:
279 case ATA_PROT_ATAPI_DMA:
280 host_stat = ap->ops->bmdma_status(ap);
281
282 /* before we do anything else, clear DMA-Start bit */
283 ap->ops->bmdma_stop(qc);
284
285 /* fall through */
286
287 default:
288 ata_altstatus(ap);
289 drv_stat = ata_chk_status(ap);
290
291 /* ack bmdma irq events */
292 ap->ops->irq_clear(ap);
293
Tejun Heof15a1da2006-05-15 20:57:56 +0900294 ata_dev_printk(qc->dev, KERN_ERR, "command 0x%x timeout, "
295 "stat 0x%x host_stat 0x%x\n",
296 qc->tf.command, drv_stat, host_stat);
Tejun Heoece1d632006-04-02 18:51:53 +0900297
298 /* complete taskfile transaction */
299 qc->err_mask |= ac_err_mask(drv_stat);
300 break;
301 }
302
303 spin_unlock_irqrestore(&host_set->lock, flags);
304
305 ata_eh_qc_complete(qc);
306
307 DPRINTK("EXIT\n");
308}
309
310/**
311 * ata_eng_timeout - Handle timeout of queued command
312 * @ap: Port on which timed-out command is active
313 *
314 * Some part of the kernel (currently, only the SCSI layer)
315 * has noticed that the active command on port @ap has not
316 * completed after a specified length of time. Handle this
317 * condition by disabling DMA (if necessary) and completing
318 * transactions, with error if necessary.
319 *
320 * This also handles the case of the "lost interrupt", where
321 * for some reason (possibly hardware bug, possibly driver bug)
322 * an interrupt was not delivered to the driver, even though the
323 * transaction completed successfully.
324 *
Tejun Heoad9e2762006-05-15 20:58:12 +0900325 * TODO: kill this function once old EH is gone.
326 *
Tejun Heoece1d632006-04-02 18:51:53 +0900327 * LOCKING:
328 * Inherited from SCSI layer (none, can sleep)
329 */
330void ata_eng_timeout(struct ata_port *ap)
331{
332 DPRINTK("ENTER\n");
333
334 ata_qc_timeout(ata_qc_from_tag(ap, ap->active_tag));
335
336 DPRINTK("EXIT\n");
337}
338
Tejun Heof686bcb2006-05-15 20:58:05 +0900339/**
340 * ata_qc_schedule_eh - schedule qc for error handling
341 * @qc: command to schedule error handling for
342 *
343 * Schedule error handling for @qc. EH will kick in as soon as
344 * other commands are drained.
345 *
346 * LOCKING:
347 * spin_lock_irqsave(host_set lock)
348 */
349void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
350{
351 struct ata_port *ap = qc->ap;
352
353 WARN_ON(!ap->ops->error_handler);
354
355 qc->flags |= ATA_QCFLAG_FAILED;
356 qc->ap->flags |= ATA_FLAG_EH_PENDING;
357
358 /* The following will fail if timeout has already expired.
359 * ata_scsi_error() takes care of such scmds on EH entry.
360 * Note that ATA_QCFLAG_FAILED is unconditionally set after
361 * this function completes.
362 */
363 scsi_req_abort_cmd(qc->scsicmd);
364}
365
Tejun Heo7b70fc02006-05-15 20:58:07 +0900366/**
367 * ata_port_schedule_eh - schedule error handling without a qc
368 * @ap: ATA port to schedule EH for
369 *
370 * Schedule error handling for @ap. EH will kick in as soon as
371 * all commands are drained.
372 *
373 * LOCKING:
374 * spin_lock_irqsave(host_set lock)
375 */
376void ata_port_schedule_eh(struct ata_port *ap)
377{
378 WARN_ON(!ap->ops->error_handler);
379
380 ap->flags |= ATA_FLAG_EH_PENDING;
381 ata_schedule_scsi_eh(ap->host);
382
383 DPRINTK("port EH scheduled\n");
384}
385
386/**
387 * ata_port_abort - abort all qc's on the port
388 * @ap: ATA port to abort qc's for
389 *
390 * Abort all active qc's of @ap and schedule EH.
391 *
392 * LOCKING:
393 * spin_lock_irqsave(host_set lock)
394 *
395 * RETURNS:
396 * Number of aborted qc's.
397 */
398int ata_port_abort(struct ata_port *ap)
399{
400 int tag, nr_aborted = 0;
401
402 WARN_ON(!ap->ops->error_handler);
403
404 for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
405 struct ata_queued_cmd *qc = ata_qc_from_tag(ap, tag);
406
407 if (qc) {
408 qc->flags |= ATA_QCFLAG_FAILED;
409 ata_qc_complete(qc);
410 nr_aborted++;
411 }
412 }
413
414 if (!nr_aborted)
415 ata_port_schedule_eh(ap);
416
417 return nr_aborted;
418}
419
Tejun Heoe3180492006-05-15 20:58:09 +0900420/**
421 * __ata_port_freeze - freeze port
422 * @ap: ATA port to freeze
423 *
424 * This function is called when HSM violation or some other
425 * condition disrupts normal operation of the port. Frozen port
426 * is not allowed to perform any operation until the port is
427 * thawed, which usually follows a successful reset.
428 *
429 * ap->ops->freeze() callback can be used for freezing the port
430 * hardware-wise (e.g. mask interrupt and stop DMA engine). If a
431 * port cannot be frozen hardware-wise, the interrupt handler
432 * must ack and clear interrupts unconditionally while the port
433 * is frozen.
434 *
435 * LOCKING:
436 * spin_lock_irqsave(host_set lock)
437 */
438static void __ata_port_freeze(struct ata_port *ap)
439{
440 WARN_ON(!ap->ops->error_handler);
441
442 if (ap->ops->freeze)
443 ap->ops->freeze(ap);
444
445 ap->flags |= ATA_FLAG_FROZEN;
446
447 DPRINTK("ata%u port frozen\n", ap->id);
448}
449
450/**
451 * ata_port_freeze - abort & freeze port
452 * @ap: ATA port to freeze
453 *
454 * Abort and freeze @ap.
455 *
456 * LOCKING:
457 * spin_lock_irqsave(host_set lock)
458 *
459 * RETURNS:
460 * Number of aborted commands.
461 */
462int ata_port_freeze(struct ata_port *ap)
463{
464 int nr_aborted;
465
466 WARN_ON(!ap->ops->error_handler);
467
468 nr_aborted = ata_port_abort(ap);
469 __ata_port_freeze(ap);
470
471 return nr_aborted;
472}
473
474/**
475 * ata_eh_freeze_port - EH helper to freeze port
476 * @ap: ATA port to freeze
477 *
478 * Freeze @ap.
479 *
480 * LOCKING:
481 * None.
482 */
483void ata_eh_freeze_port(struct ata_port *ap)
484{
485 unsigned long flags;
486
487 if (!ap->ops->error_handler)
488 return;
489
490 spin_lock_irqsave(&ap->host_set->lock, flags);
491 __ata_port_freeze(ap);
492 spin_unlock_irqrestore(&ap->host_set->lock, flags);
493}
494
495/**
496 * ata_port_thaw_port - EH helper to thaw port
497 * @ap: ATA port to thaw
498 *
499 * Thaw frozen port @ap.
500 *
501 * LOCKING:
502 * None.
503 */
504void ata_eh_thaw_port(struct ata_port *ap)
505{
506 unsigned long flags;
507
508 if (!ap->ops->error_handler)
509 return;
510
511 spin_lock_irqsave(&ap->host_set->lock, flags);
512
513 ap->flags &= ~ATA_FLAG_FROZEN;
514
515 if (ap->ops->thaw)
516 ap->ops->thaw(ap);
517
518 spin_unlock_irqrestore(&ap->host_set->lock, flags);
519
520 DPRINTK("ata%u port thawed\n", ap->id);
521}
522
/*
 * No-op SCSI completion callback.  Installed as qc->scsidone by
 * __ata_eh_qc_complete() before the qc is completed from EH.
 */
static void ata_eh_scsidone(struct scsi_cmnd *scmd)
{
	/* intentionally empty */
}
527
528static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
529{
530 struct ata_port *ap = qc->ap;
531 struct scsi_cmnd *scmd = qc->scsicmd;
532 unsigned long flags;
533
534 spin_lock_irqsave(&ap->host_set->lock, flags);
535 qc->scsidone = ata_eh_scsidone;
536 __ata_qc_complete(qc);
537 WARN_ON(ata_tag_valid(qc->tag));
538 spin_unlock_irqrestore(&ap->host_set->lock, flags);
539
540 scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
541}
542
543/**
544 * ata_eh_qc_complete - Complete an active ATA command from EH
545 * @qc: Command to complete
546 *
547 * Indicate to the mid and upper layers that an ATA command has
548 * completed. To be used from EH.
549 */
550void ata_eh_qc_complete(struct ata_queued_cmd *qc)
551{
552 struct scsi_cmnd *scmd = qc->scsicmd;
553 scmd->retries = scmd->allowed;
554 __ata_eh_qc_complete(qc);
555}
556
557/**
558 * ata_eh_qc_retry - Tell midlayer to retry an ATA command after EH
559 * @qc: Command to retry
560 *
561 * Indicate to the mid and upper layers that an ATA command
562 * should be retried. To be used from EH.
563 *
564 * SCSI midlayer limits the number of retries to scmd->allowed.
565 * scmd->retries is decremented for commands which get retried
566 * due to unrelated failures (qc->err_mask is zero).
567 */
568void ata_eh_qc_retry(struct ata_queued_cmd *qc)
569{
570 struct scsi_cmnd *scmd = qc->scsicmd;
571 if (!qc->err_mask && scmd->retries)
572 scmd->retries--;
573 __ata_eh_qc_complete(qc);
574}