/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#pragma dictionary "DISK"

/*
 * Every event in this file is declared against the "disk" node, which is
 * also used as the default FRU and ASRU.
 */
#define	P	disk

fru P;
asru P;

/*
 * Overall comments for this file:
 * The disk-as-detector DE provides the mapping between
 * ereports generated by a kernel disk driver sd(4D) and resulting faults.
 */

/*
 * SERD engine for media error fault propagation:
 *
 * This strategy is designed to give a file system, like ZFS, the
 * ability to attempt data recovery/relocation without faulting a disk.
 * This implementation depends on a file system retry to the same lba
 * to trigger a fault when recovery/relocation is not possible.
 *
 * We let the engine propagate one error only once every 1 minute and then if
 * we still get 2 or more errors within 24 hours for the same LBA,
 * there is a fault.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;

/*
 * disk-as-detector: fault events.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;

/*
 * The uderr fault will be defined at some future time.
 * event fault.io.scsi.cmd.disk.dev.uderr@P;
 */

/*
 * disk-as-detector: upset events.
 * NOTE: For now we define an upset to implement discard.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.tran@P;
event upset.io.scsi.cmd.disk.recovered@P;

/*
 * disk-as-detector: ereports from the kernel.
 *
 * We don't know the topology for all scsi disks, but the kernel will always
 * generate ereport telemetry assuming that we do. We define these ereports
 * with 'discard_if_config_unknown=1', which permits ereports against things
 * with unknown topology to be silently discarded. The ereport data is logged
 * in either case, and can be viewed via 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;

/*
 * For some ereports we let the 'driver-assessment', communicated as part of
 * the ereport payload, determine fault vs. upset via propagation constraints.
 * Anything that is not assessed as "fatal" is treated as non-fatal.
 */
#define	DRIVER_ASSESSMENT_FATAL \
	(payloadprop_contains("driver-assessment", "fatal"))
#define	DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)

/*
 * disk-as-detector: propagations from faults (based on
 * DRIVER_ASSESSMENT_FATAL).
 * We need to set additional fault payloads to indicate fault details.
 * The payloads we may need are listed as follows:
 * fault.io.scsi.cmd.disk.dev.rqs.derr
 *	op_code, key, asc, ascq
 * fault.io.scsi.cmd.disk.dev.rqs.merr
 *	op_code, key, asc, ascq, lba
 *
 * NOTE: op_code is listed above but is not set by either propagation
 * below; only key, asc, ascq (and lba for merr) are copied from the ereport.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{
	DRIVER_ASSESSMENT_FATAL &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq"))};

/*
 * Utilize setserdsuffix with specific LBA,
 * the serd engine would only trigger if the fault recurred on the same LBA
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{
	DRIVER_ASSESSMENT_FATAL &&
	setserdsuffix(payloadprop("lba")) &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq")) &&
	setpayloadprop("lba", payloadprop("lba"))};

/*
 * NOTE: this propagation uses the "may" propagation of eversholt.
 * The ereport need never exist. It's just a way of making
 * the diagnosis wait for the within time on that ereport
 * to complete. Once it has completed the diagnosis continues
 * even though the dummy ereport didn't occur.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;

/*
 * The uderr fault will be propagated at some future time.
 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 */

/*
 * disk-as-detector: propagations from upsets (based on
 * DRIVER_ASSESSMENT_NONFATAL).
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };
prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };

/*
 * disk-as-detector: propagations from upsets (independent of
 * driver-assessment)
 */
prop upset.io.scsi.cmd.disk.dev.serr@P->
    ereport.io.scsi.cmd.disk.dev.serr@P;
prop upset.io.scsi.cmd.disk.dev.uderr@P->
    ereport.io.scsi.cmd.disk.dev.uderr@P;
prop upset.io.scsi.cmd.disk.recovered@P->
    ereport.io.scsi.cmd.disk.recovered@P;
prop upset.io.scsi.cmd.disk.tran@P->
    ereport.io.scsi.cmd.disk.tran@P;

/*
 * --------------------------------------
 * The remainder of this file contains rules associated with the operation of
 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 *
 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 * generated by the disk-transport fmd module, and the resulting faults.
 */

/*
 * Fault events.
 *
 * FITrate is declared exactly once per event; fault.io.disk.ssm-wearout
 * declares no explicit FITrate/FRU/ASRU properties.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.ssm-wearout@P;

/*
 * ereports.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;
event ereport.io.scsi.disk.ssm-wearout@P;

/*
 * Propagations.
 *
 * The predictive-failure and ssm-wearout faults copy selected ereport
 * payload members into the fault under shorter/renamed property names.
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;

prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P {
	setpayloadprop("asc", payloadprop("additional-sense-code")) &&
	setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier"))
    };

prop fault.io.disk.ssm-wearout@P ->
    ereport.io.scsi.disk.ssm-wearout@P {
	setpayloadprop("current-wearout-percentage",
	    payloadprop("current-ssm-wearout")) &&
	setpayloadprop("threshold-wearout-percentage",
	    payloadprop("threshold-ssm-wearout"))
    };