/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#pragma dictionary "INTEL"

/*
 * Eversholt rules for the intel CPU/Memory
 */

/*
 * Ereports for Simple error codes.
 */

#define SMPL_EVENT(leafclass, t) \
    event ereport.cpu.intel.leafclass@chip/core/strand { within(t) }

SMPL_EVENT(unknown, 1s);
SMPL_EVENT(unclassified, 1s);
SMPL_EVENT(microcode_rom_parity, 1s);
SMPL_EVENT(external, 1s);
SMPL_EVENT(frc, 1s);
SMPL_EVENT(internal_timer, 1s);
SMPL_EVENT(internal_parity, 1s);
SMPL_EVENT(internal_unclassified, 1s);

/*
 * Propagations for all but "external" and "unknown" simple errors.
 * If the error is uncorrected we produce a fault immediately, otherwise
 * we diagnose it to an upset and declare a fault when the SERD engine
 * trips. prop statement for ereport.cpu.intel.internal_unclassified is
 * moved to the Nehalem EX section to deal with poison case.
 */

engine serd.cpu.intel.simple@chip/core/strand, N=3, T=72h;
event fault.cpu.intel.internal@chip/core/strand,
    engine=serd.cpu.intel.simple@chip/core/strand;

/* An uncorrected error bumps the engine by 4, i.e. by more than its N=3. */
prop fault.cpu.intel.internal@chip/core/strand
    { payloadprop("error_uncorrected") == 1 ? setserdincrement(4) : 1} (0)->
    ereport.cpu.intel.microcode_rom_parity@chip/core/strand,
    ereport.cpu.intel.internal_timer@chip/core/strand,
    ereport.cpu.intel.internal_parity@chip/core/strand,
    ereport.cpu.intel.unclassified@chip/core/strand,
    ereport.cpu.intel.frc@chip/core/strand;

/*
 * Ereports for Compound error codes. These are in pairs "foo" and "foo_uc"
 * for the corrected and uncorrected version of each error type. All are
 * detected at chip/core/strand.
 */

#define CMPND_EVENT(leafclass, t) \
    event ereport.cpu.intel.leafclass@chip/core/strand { within(t) }; \
    event ereport.cpu.intel.leafclass/**/_uc@chip/core/strand { within(t) }

/*
 * Ereports for Compound error codes - intel errors
 */
CMPND_EVENT(l0cache, 1s);
CMPND_EVENT(l1cache, 1s);
CMPND_EVENT(l2cache, 1s);
CMPND_EVENT(cache, 1s);

/*
 * Ereports for Compound error codes - TLB errors
 */
CMPND_EVENT(l0dtlb, 1s);
CMPND_EVENT(l1dtlb, 1s);
CMPND_EVENT(l2dtlb, 1s);
CMPND_EVENT(dtlb, 1s);

CMPND_EVENT(l0itlb, 1s);
CMPND_EVENT(l1itlb, 1s);
CMPND_EVENT(l2itlb, 1s);
CMPND_EVENT(itlb, 1s);

CMPND_EVENT(l0tlb, 1s);
CMPND_EVENT(l1tlb, 1s);
CMPND_EVENT(l2tlb, 1s);
CMPND_EVENT(tlb, 1s);

/*
 * Ereports for Compound error codes - memory hierarchy errors
 */
CMPND_EVENT(l0dcache, 1s);
CMPND_EVENT(l1dcache, 1s);
CMPND_EVENT(l2dcache, 1s);
CMPND_EVENT(dcache, 1s);

CMPND_EVENT(l0icache, 1s);
CMPND_EVENT(l1icache, 1s);
CMPND_EVENT(l2icache, 1s);
CMPND_EVENT(icache, 1s);

/*
 * Ereports for Compound error codes - bus and interconnect errors
 */
CMPND_EVENT(bus_interconnect, 1s);
CMPND_EVENT(bus_interconnect_memory, 1s);
CMPND_EVENT(bus_interconnect_io, 1s);

/*
 * Compound error propagations.
 *
 * We resist the temptation to propagate, for example, a single dcache fault
 * to all ereports mentioning dcache (l0dcache, l1dcache, l2dcache, dcache).
 * Instead we will diagnose a distinct fault for each possible cache level,
 * whether or not current chips have dcaches at all levels.
 *
 * Corrected errors are SERDed and produce a fault when the engine fires;
 * the same fault is diagnosed immediately for a corresponding uncorrected
 * error.
 */

/*
 * CMPND_FLT_PROP_1(erptleaf, fltleaf, n, t) declares a SERD engine (N=n, T=t)
 * and a fault for fltleaf, then propagates that fault to the corrected
 * ereport erptleaf and, with an increment of n + 1, to erptleaf_uc.
 */
#define CMPND_FLT_PROP_1(erptleaf, fltleaf, n, t) \
    engine serd.cpu.intel.fltleaf@chip/core/strand, N=n, T=t; \
    event fault.cpu.intel.fltleaf@chip/core/strand, \
        engine=serd.cpu.intel.fltleaf@chip/core/strand; \
    \
    prop fault.cpu.intel.fltleaf@chip/core/strand (0)-> \
        ereport.cpu.intel.erptleaf@chip/core/strand; \
    \
    prop fault.cpu.intel.fltleaf@chip/core/strand \
        { setserdincrement(n + 1) } (0)-> \
        ereport.cpu.intel.erptleaf/**/_uc@chip/core/strand

/*
 * CMPND_FLT_PROP_2 is the same as CMPND_FLT_PROP_1 except that the fault
 * is declared with retire=0, response=0.
 */
#define CMPND_FLT_PROP_2(erptleaf, fltleaf, n, t) \
    engine serd.cpu.intel.fltleaf@chip/core/strand, N=n, T=t; \
    event fault.cpu.intel.fltleaf@chip/core/strand, retire=0, response=0,\
        engine=serd.cpu.intel.fltleaf@chip/core/strand; \
    \
    prop fault.cpu.intel.fltleaf@chip/core/strand (0)-> \
        ereport.cpu.intel.erptleaf@chip/core/strand; \
    \
    prop fault.cpu.intel.fltleaf@chip/core/strand \
        { setserdincrement(n + 1) } (0)-> \
        ereport.cpu.intel.erptleaf/**/_uc@chip/core/strand

CMPND_FLT_PROP_1(l0cache, l0cache, 3, 72h);
CMPND_FLT_PROP_1(l1cache, l1cache, 3, 72h);
CMPND_FLT_PROP_1(l2cache, l2cache, 3, 72h);
CMPND_FLT_PROP_1(cache, cache, 12, 72h);

CMPND_FLT_PROP_1(l0dtlb, l0dtlb, 3, 72h);
CMPND_FLT_PROP_1(l1dtlb, l1dtlb, 3, 72h);
CMPND_FLT_PROP_1(l2dtlb, l2dtlb, 3, 72h);
CMPND_FLT_PROP_1(dtlb, dtlb, 12, 72h);

CMPND_FLT_PROP_1(l0itlb, l0itlb, 3, 72h);
CMPND_FLT_PROP_1(l1itlb, l1itlb, 3, 72h);
CMPND_FLT_PROP_1(l2itlb, l2itlb, 3, 72h);
CMPND_FLT_PROP_1(itlb, itlb, 12, 72h);

CMPND_FLT_PROP_1(l0tlb, l0tlb, 3, 72h);
CMPND_FLT_PROP_1(l1tlb, l1tlb, 3, 72h);
CMPND_FLT_PROP_1(l2tlb, l2tlb, 3, 72h);
CMPND_FLT_PROP_1(tlb, tlb, 12, 72h);

CMPND_FLT_PROP_1(l0dcache, l0dcache, 3, 72h);
CMPND_FLT_PROP_1(l1dcache, l1dcache, 3, 72h);
CMPND_FLT_PROP_1(l2dcache, l2dcache, 3, 72h);
CMPND_FLT_PROP_1(dcache, dcache, 12, 72h);

CMPND_FLT_PROP_1(l0icache, l0icache, 3, 72h);
CMPND_FLT_PROP_1(l1icache, l1icache, 3, 72h);
CMPND_FLT_PROP_1(l2icache, l2icache, 3, 72h);
CMPND_FLT_PROP_1(icache, icache, 12, 72h);

CMPND_FLT_PROP_2(bus_interconnect, bus_interconnect, 10, 72h);
CMPND_FLT_PROP_2(bus_interconnect_memory, bus_interconnect_memory, 10, 72h);
CMPND_FLT_PROP_2(bus_interconnect_io, bus_interconnect_io, 10, 72h);

event upset.discard@chip/core/strand;
event ereport.cpu.intel.unknown@chip {within(15s)};

prop upset.discard@chip/core/strand (0)->
    ereport.cpu.intel.external@chip/core/strand,
    ereport.cpu.intel.unknown@chip/core/strand,
    ereport.cpu.intel.unknown@chip;

/* errors detected in northbridge */

/*
 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
 * we diagnose for page faults, to record the physical address of the faulting
 * page.
 */
#define SET_ADDR (!payloadprop_defined("physaddr") || \
    setpayloadprop("asru-physaddr", payloadprop("physaddr")))

#define SET_OFFSET (!payloadprop_defined("offset") || \
    setpayloadprop("asru-offset", payloadprop("offset")))

/* CPU bus ereports that many of the faults and upsets below also explain. */
#define EREPORT_BUS_ERROR \
    ereport.cpu.intel.bus_interconnect_memory_uc@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect_uc@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect_memory@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect@chip/core/strand, \
    ereport.cpu.intel.external@chip/core/strand

engine stat.ce_pgflt@memory-controller/dram-channel/dimm;

event ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller{within(12s)};
event ereport.cpu.intel.nb.ddr2_mem_ue@
    motherboard/memory-controller{within(12s)};
event ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller{within(12s)};
event fault.memory.intel.page_ue@
    motherboard/memory-controller/dram-channel/dimm/rank,
    message=0, response=0;
event fault.memory.intel.dimm_ue@
    motherboard/memory-controller/dram-channel/dimm/rank;

prop fault.memory.intel.page_ue@
    motherboard/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.ddr2_mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

prop fault.memory.intel.dimm_ue@
    motherboard/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") } (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.ddr2_mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

event upset.memory.intel.discard@motherboard/memory-controller{within(1s)};

prop upset.memory.intel.discard@motherboard/memory-controller (0)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.ddr2_mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

prop upset.memory.intel.discard@motherboard/memory-controller (0)->
    EREPORT_BUS_ERROR;

/* Default SERD thresholds for page-level and DIMM-level CE faults. */
#define PAGE_CE_COUNT 2
#define PAGE_CE_TIME 72h
#define DIMM_CE_COUNT 10
#define DIMM_CE_TIME 1week

#define MBDIMM motherboard/memory-controller/dram-channel/dimm
event ereport.cpu.intel.nb.mem_ce@MBDIMM/rank{within(12s)};
event ereport.cpu.intel.nb.ddr2_mem_ce@MBDIMM/rank{within(12s)};
event ereport.cpu.intel.nb.ddr2_mem_ce@
    motherboard/memory-controller{within(12s)};

engine serd.memory.intel.page_ce@MBDIMM/rank, N=PAGE_CE_COUNT, T=PAGE_CE_TIME;
event fault.memory.intel.page_ce@MBDIMM/rank, message=0, response=0,
    count=stat.ce_pgflt@MBDIMM, engine=serd.memory.intel.page_ce@MBDIMM/rank;
prop fault.memory.intel.page_ce@MBDIMM/rank
    { (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.intel.nb.mem_ce@MBDIMM/rank,
    ereport.cpu.intel.nb.ddr2_mem_ce@MBDIMM/rank;

engine serd.memory.intel.dimm_ce@MBDIMM/rank, N=DIMM_CE_COUNT, T=DIMM_CE_TIME;
event fault.memory.intel.dimm_ce@MBDIMM/rank,
    engine=serd.memory.intel.dimm_ce@MBDIMM/rank;
prop fault.memory.intel.dimm_ce@MBDIMM/rank
    { !confprop_defined(MBDIMM, "dimm-size") &&
    count(stat.ce_pgflt@MBDIMM) > 512 } (1)->
    ereport.cpu.intel.nb.mem_ce@MBDIMM/rank,
    ereport.cpu.intel.nb.ddr2_mem_ce@MBDIMM/rank;

/*
 * DIMM_CE(dimm_size, n, t, fault_rate): for a DIMM whose "dimm-size"
 * config property equals dimm_size, apply the rule once the ce_pgflt count
 * exceeds fault_rate, with the SERD engine set to N=n, T=t.
 */
#define DIMM_CE(dimm_size, n, t, fault_rate) \
    prop fault.memory.intel.dimm_ce@MBDIMM/rank { \
        confprop(MBDIMM, "dimm-size") == dimm_size && \
        count(stat.ce_pgflt@MBDIMM) > fault_rate && \
        setserdn(n) & setserdt(t) } (1)-> \
        ereport.cpu.intel.nb.mem_ce@MBDIMM/rank, \
        ereport.cpu.intel.nb.ddr2_mem_ce@MBDIMM/rank;

DIMM_CE("8G", 8, 1week, 2000)
DIMM_CE("4G", 4, 1week, 1500)
DIMM_CE("2G", 4, 2week, 1000)
DIMM_CE("1G", 4, 4week, 500)
DIMM_CE("512M", 4, 8week, 250)
DIMM_CE("256M", 4, 16week, 125)

prop upset.memory.intel.discard@motherboard/memory-controller (0)->
    ereport.cpu.intel.nb.ddr2_mem_ce@motherboard/memory-controller;

event ereport.cpu.intel.nb.fbd.alert@rank{within(12s)};
event fault.memory.intel.fbd.alert@rank, retire=0;

prop fault.memory.intel.fbd.alert@rank (1)->
    ereport.cpu.intel.nb.fbd.alert@rank;

prop fault.memory.intel.fbd.alert@rank (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.fbd.crc@rank{within(12s)};
event fault.memory.intel.fbd.crc@rank, retire=0;

prop fault.memory.intel.fbd.crc@rank (1)->
    ereport.cpu.intel.nb.fbd.crc@rank;

prop fault.memory.intel.fbd.crc@rank (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.fbd.reset_timeout@memory-controller {within(12s)};
event fault.memory.intel.fbd.reset_timeout@memory-controller, retire=0;

prop fault.memory.intel.fbd.reset_timeout@memory-controller (1)->
    ereport.cpu.intel.nb.fbd.reset_timeout@memory-controller;

prop fault.memory.intel.fbd.reset_timeout@memory-controller (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.fbd.ch@dram-channel {within(12s)};
engine serd.cpu.intel.nb.fbd.ch@dram-channel, N=2, T=1month;
event fault.memory.intel.fbd.ch@dram-channel, retire=0,
    engine=serd.cpu.intel.nb.fbd.ch@dram-channel;

prop fault.memory.intel.fbd.ch@dram-channel (1)->
    ereport.cpu.intel.nb.fbd.ch@dram-channel;

prop fault.memory.intel.fbd.ch@dram-channel (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.fbd.otf@dram-channel {within(12s)};
engine serd.cpu.intel.nb.fbd_otf@dram-channel, N=2, T=1week;
event fault.memory.intel.fbd.otf@dram-channel, retire=0, response=0,
    engine=serd.cpu.intel.nb.fbd_otf@dram-channel;

prop fault.memory.intel.fbd.otf@dram-channel (1)->
    ereport.cpu.intel.nb.fbd.otf@dram-channel;

event ereport.cpu.intel.nb.otf@motherboard {within(12s)};
event fault.cpu.intel.nb.otf@motherboard, retire=0, response=0;

prop fault.cpu.intel.nb.otf@motherboard (1)->
    ereport.cpu.intel.nb.otf@motherboard;

event ereport.cpu.intel.nb.unknown@motherboard {within(12s)};
event ereport.cpu.intel.nb.unknown@memory-controller {within(12s)};
event ereport.cpu.intel.nb.unknown@memory-controller/dram-channel {within(12s)};
event ereport.cpu.intel.nb.spd@memory-controller/dram-channel {within(12s)};
event ereport.cpu.intel.nb.ddr2_spd@
    memory-controller/dram-channel {within(12s)};
event upset.discard@memory-controller;

prop upset.discard@memory-controller (0)->
    ereport.cpu.intel.nb.unknown@motherboard,
    ereport.cpu.intel.nb.unknown@memory-controller,
    ereport.cpu.intel.nb.unknown@memory-controller/dram-channel,
    ereport.cpu.intel.nb.spd@memory-controller/dram-channel,
    ereport.cpu.intel.nb.ddr2_spd@memory-controller/dram-channel;

event ereport.cpu.intel.nb.mem_ds@memory-controller{within(30s)};
event ereport.cpu.intel.nb.ddr2_mem_ds@memory-controller{within(30s)};
event fault.memory.intel.fbd.mem_ds@memory-controller/dram-channel/dimm/rank,
    retire=0;

prop fault.memory.intel.fbd.mem_ds@
    memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") } (1)->
    ereport.cpu.intel.nb.mem_ds@memory-controller,
    ereport.cpu.intel.nb.ddr2_mem_ds@memory-controller;

event ereport.cpu.intel.nb.fsb@chip{within(12s)};
event fault.cpu.intel.nb.fsb@chip, retire=0;

prop fault.cpu.intel.nb.fsb@chip (1)->
    ereport.cpu.intel.nb.fsb@chip;

prop fault.cpu.intel.nb.fsb@chip (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.ie@motherboard{within(12s)};
event fault.cpu.intel.nb.ie@motherboard, retire=0;
event upset.cpu.intel.nb.ie_ce@motherboard{within(12s)};

/* An "intel-error-list" of "B6" goes to the upset; anything else faults. */
prop upset.cpu.intel.nb.ie_ce@motherboard
    { payloadprop("intel-error-list") == "B6" } (0)->
    ereport.cpu.intel.nb.ie@motherboard;

prop fault.cpu.intel.nb.ie@motherboard
    { payloadprop("intel-error-list") != "B6" } (1)->
    ereport.cpu.intel.nb.ie@motherboard;

prop fault.cpu.intel.nb.ie@motherboard (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.dma@motherboard{within(12s)};
event upset.cpu.intel.nb.dma@motherboard;

prop upset.cpu.intel.nb.dma@motherboard (1)->
    ereport.cpu.intel.nb.dma@motherboard;

event ereport.cpu.intel.nb.esi@motherboard{within(12s)};
event ereport.cpu.intel.nb.pex@hostbridge{within(12s)};
event upset.cpu.intel.nb.pex@hostbridge;

prop upset.cpu.intel.nb.pex@hostbridge (1)->
    ereport.cpu.intel.nb.esi@motherboard,
    ereport.cpu.intel.nb.pex@hostbridge;

prop upset.cpu.intel.nb.pex@hostbridge (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.nb.unknown@rank{within(12s)};
event upset.discard@rank;

prop upset.discard@rank (1)->
    ereport.cpu.intel.nb.unknown@rank;

prop upset.discard@rank (0)->
    EREPORT_BUS_ERROR;

/*
 * CPU integrated memory controller
 */

/* True when the ereport's "resource" payload names this rank or its DIMM. */
#define CONTAINS_RANK (payloadprop_contains("resource", \
    asru(chip/memory-controller/dram-channel/dimm/rank)) || \
    payloadprop_contains("resource", \
    asru(chip/memory-controller/dram-channel/dimm)))

#define STAT_CPU_MEM_CE_PGFLTS \
    stat.ce_pgflt@chip/memory-controller/dram-channel/dimm

/* Copy resource[0].hc-specific.offset, when present, to "asru-offset". */
#define SET_RES_OFFSET \
    (!payloadprop_defined("resource[0].hc-specific.offset") || \
    setpayloadprop("asru-offset", \
    payloadprop("resource[0].hc-specific.offset")))

engine STAT_CPU_MEM_CE_PGFLTS;

event ereport.cpu.intel.quickpath.mem_ue@chip/memory-controller
    {within(12s)}, discard_if_config_unknown=1;

event fault.memory.intel.page_ue@
    chip/memory-controller/dram-channel/dimm/rank,
    message=0, response=0;	/* do not message individual pageflts */

prop fault.memory.intel.page_ue@
    chip/memory-controller/dram-channel/dimm/rank
    { CONTAINS_RANK && (payloadprop_defined("physaddr") ||
    payloadprop_defined("resource[0].hc-specific.offset")) &&
    SET_ADDR && SET_RES_OFFSET } (0)->
    ereport.cpu.intel.quickpath.mem_ue@chip/memory-controller;

#define CHIPDIMM chip/memory-controller/dram-channel/dimm

event fault.memory.intel.dimm_ue@CHIPDIMM/rank;
event error.memory.intel.dimm_ue_ep@CHIPDIMM/rank;
event error.memory.intel.dimm_ue_ex@CHIPDIMM/rank;

prop fault.memory.intel.dimm_ue@CHIPDIMM/rank (1)->
    error.memory.intel.dimm_ue_ep@CHIPDIMM/rank,
    error.memory.intel.dimm_ue_ex@CHIPDIMM/rank;

prop error.memory.intel.dimm_ue_ep@CHIPDIMM/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.intel.quickpath.mem_ue@chip/memory-controller;

prop fault.memory.intel.dimm_ue@CHIPDIMM/rank (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.quickpath.mem_ce@
    chip/memory-controller {within(12s)}, discard_if_config_unknown=1;

engine serd.memory.intel.page_ce@CHIPDIMM/rank,
    N=PAGE_CE_COUNT, T=PAGE_CE_TIME;
event fault.memory.intel.page_ce@CHIPDIMM/rank, message=0, response=0,
    count=STAT_CPU_MEM_CE_PGFLTS,
    engine=serd.memory.intel.page_ce@CHIPDIMM/rank;
prop fault.memory.intel.page_ce@CHIPDIMM/rank
    { CONTAINS_RANK && (payloadprop_defined("physaddr") ||
    payloadprop_defined("resource[0].hc-specific.offset")) &&
    SET_ADDR && SET_RES_OFFSET } (0)->
    ereport.cpu.intel.quickpath.mem_ce@chip/memory-controller;

/*
 * The DIMM-level engine takes the DIMM thresholds, the same defaults as
 * serd.memory.intel.dimm_ce@MBDIMM/rank above.  The page thresholds belong
 * to the page_ce engine only.  These defaults matter for the rule below that
 * handles an undefined "dimm-size", since it does not call
 * setserdn()/setserdt(); the CPU_MEM_DIMM_CE rules set their own N and T.
 */
engine serd.memory.intel.dimm_ce@CHIPDIMM, N=DIMM_CE_COUNT, T=DIMM_CE_TIME;
event fault.memory.intel.dimm_ce@CHIPDIMM,
    engine=serd.memory.intel.dimm_ce@CHIPDIMM;
prop fault.memory.intel.dimm_ce@CHIPDIMM
    { !confprop_defined(CHIPDIMM, "dimm-size") &&
    count(STAT_CPU_MEM_CE_PGFLTS) > 512 } (0)->
    ereport.cpu.intel.quickpath.mem_ce@chip/memory-controller;

/*
 * CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate): same shape as DIMM_CE, but
 * for DIMMs behind the CPU integrated memory controller.
 */
#define CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate) \
    prop fault.memory.intel.dimm_ce@CHIPDIMM { \
        confprop(CHIPDIMM, "dimm-size") == dimm_size && \
        count(STAT_CPU_MEM_CE_PGFLTS) > fault_rate && \
        setserdn(n) & setserdt(t) } (0)-> \
        ereport.cpu.intel.quickpath.mem_ce@ \
        chip/memory-controller;

CPU_MEM_DIMM_CE("16G", 16, 1week, 2000)
CPU_MEM_DIMM_CE("8G", 8, 1week, 2000)
CPU_MEM_DIMM_CE("4G", 4, 1week, 1500)
CPU_MEM_DIMM_CE("2G", 4, 2week, 1000)
CPU_MEM_DIMM_CE("1G", 4, 4week, 500)
CPU_MEM_DIMM_CE("512M", 4, 8week, 250)

event ereport.cpu.intel.quickpath.mem_unknown@chip/memory-controller
    {within(12s)}, discard_if_config_unknown=1;
event ereport.cpu.intel.quickpath.mem_unknown@
    chip/memory-controller/dram-channel {within(12s)},
    discard_if_config_unknown=1;
event ereport.cpu.intel.quickpath.mem_unknown@
    chip/memory-controller/dram-channel/dimm/rank{within(12s)};
event upset.discard@chip/memory-controller;
event upset.discard@chip/memory-controller/dram-channel/dimm/rank;

prop upset.discard@chip/memory-controller (0)->
    ereport.cpu.intel.quickpath.mem_unknown@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_unknown@
    chip/memory-controller/dram-channel;

prop upset.discard@
    chip/memory-controller/dram-channel/dimm/rank (1)->
    ereport.cpu.intel.quickpath.mem_unknown@
    chip/memory-controller/dram-channel/dimm/rank;

event ereport.cpu.intel.quickpath.mem_parity@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;
event fault.cpu.intel.quickpath.mem_parity@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_parity@chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_parity@chip/memory-controller;

event ereport.cpu.intel.quickpath.mem_addr_parity@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;
event fault.cpu.intel.quickpath.mem_addr_parity@
    chip/memory-controller;
event fault.cpu.intel.quickpath.mem_addr_parity@CHIPDIMM;
event fault.cpu.intel.quickpath.mem_addr_parity@CHIPDIMM/rank;

prop fault.cpu.intel.quickpath.mem_addr_parity@
    chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_addr_parity@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_addr_parity@CHIPDIMM
    { payloadprop_contains("resource", asru(CHIPDIMM)) } (1)->
    ereport.cpu.intel.quickpath.mem_addr_parity@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_addr_parity@CHIPDIMM/rank
    { payloadprop_contains("resource", asru(CHIPDIMM/rank)) } (1)->
    ereport.cpu.intel.quickpath.mem_addr_parity@chip/memory-controller;

event ereport.cpu.intel.quickpath.mem_bad_addr@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;
event fault.cpu.intel.quickpath.mem_bad_addr@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_bad_addr@chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_bad_addr@chip/memory-controller;

event ereport.cpu.intel.quickpath.mem_spare@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;
event fault.cpu.intel.quickpath.mem_spare@
    chip/memory-controller/dram-channel/dimm;

prop fault.cpu.intel.quickpath.mem_spare@
    chip/memory-controller/dram-channel/dimm (1)->
    ereport.cpu.intel.quickpath.mem_spare@chip/memory-controller;

event ereport.cpu.intel.quickpath.mem_bad_id@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;
event fault.cpu.intel.quickpath.mem_bad_id@chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_bad_id@chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_bad_id@chip/memory-controller;

event ereport.cpu.intel.quickpath.mem_redundant@chip/memory-controller
    {within(1s)}, discard_if_config_unknown=1;
engine serd.cpu.intel.quickpath.mem_redundant@CHIPDIMM, N=2, T=72h;
event fault.cpu.intel.quickpath.mem_redundant@CHIPDIMM,
    engine=serd.cpu.intel.quickpath.mem_redundant@CHIPDIMM;
event error.cpu.intel.quickpath.mem_redundant@CHIPDIMM/rank;

prop fault.cpu.intel.quickpath.mem_redundant@CHIPDIMM (1)->
    error.cpu.intel.quickpath.mem_redundant@CHIPDIMM/rank<>;

prop error.cpu.intel.quickpath.mem_redundant@CHIPDIMM/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.intel.quickpath.mem_redundant@
    chip/memory-controller;

#define STATUS_UC (payloadprop("error_uncorrected") == 1)

event ereport.cpu.intel.quickpath.interconnect@chip
    {within(1s)};
event upset.cpu.intel.quickpath.interconnect@chip;

/* Diagnose corrected events to upsets */
prop upset.cpu.intel.quickpath.interconnect@chip
    { !STATUS_UC } (1)->
    ereport.cpu.intel.quickpath.interconnect@chip;

engine serd.cpu.intel.quickpath.interconnect@chip,
    N=3, T=72h;
event fault.cpu.intel.quickpath.interconnect@chip,
    engine=serd.cpu.intel.quickpath.interconnect@chip;

/* Diagnose uncorrected events to faults */
prop fault.cpu.intel.quickpath.interconnect@chip
    { STATUS_UC } (0)->
    ereport.cpu.intel.quickpath.interconnect@chip;

/*
 * Nehalem EX specific rules
 */

/* MBox errors */

#define EX_MEM_EVENT(leafclass, t) \
    event ereport.cpu.intel.quickpath.leafclass@ \
    chip/memory-controller { within(t) }, discard_if_config_unknown=1

EX_MEM_EVENT(mem_lnktrns, 1s);
EX_MEM_EVENT(mem_lnkpers, 1s);
EX_MEM_EVENT(mem_sbfbdlinkerr, 1s);
EX_MEM_EVENT(mem_nbfbdlnkerr, 1s);
EX_MEM_EVENT(mem_lnkcrcvld, 1s);

engine serd.cpu.intel.quickpath.mem_link_ce@chip/memory-controller,
    N=500, T=1week;
event fault.cpu.intel.quickpath.mem_link_ce@chip/memory-controller,
    engine=serd.cpu.intel.quickpath.mem_link_ce@chip/memory-controller,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.mem_link_ce@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_lnktrns@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_lnkpers@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_sbfbdlinkerr@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_nbfbdlnkerr@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_lnkcrcvld@chip/memory-controller;

EX_MEM_EVENT(mem_lnkuncorr_uc, 1s);
EX_MEM_EVENT(mem_lnkpers_uc, 1s);
EX_MEM_EVENT(mem_sbfbdlinkerr_uc, 1s);
/*
 * Constraint shared by the per-rank rules below: the ereport carries a
 * "rank" payload member and it equals the rank instance being considered.
 */
#define EX_RANK_MATCH \
    (payloadprop_defined("rank") && rank_num == payloadprop("rank"))

/* Remaining uncorrected MBox link ereports. */
EX_MEM_EVENT(mem_nbfbdlnkerr_uc, 1s);
EX_MEM_EVENT(mem_lnkcrcvld_uc, 1s);

/* Uncorrected link errors: fault on the memory controller, no SERD engine. */
event fault.cpu.intel.quickpath.mem_link_ue@chip/memory-controller,
    retire=0;

prop fault.cpu.intel.quickpath.mem_link_ue@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_lnkuncorr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_lnkpers_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_sbfbdlinkerr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_nbfbdlnkerr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_lnkcrcvld_uc@chip/memory-controller;

/* Corrected memory controller errors, counted by a 500-per-week engine. */
EX_MEM_EVENT(mem_ptrl_fsm_err, 1s);
EX_MEM_EVENT(mem_errflw_fsm_fail, 1s);
EX_MEM_EVENT(mem_vberr, 1s);

engine serd.cpu.intel.quickpath.mem_controller_ce@chip/memory-controller,
    N=500, T=1week;
event fault.cpu.intel.quickpath.mem_controller_ce@chip/memory-controller,
    engine=serd.cpu.intel.quickpath.mem_controller_ce@chip/memory-controller,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.mem_controller_ce@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_ptrl_fsm_err@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_errflw_fsm_fail@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_vberr@chip/memory-controller;

/* Uncorrected memory controller errors. */
EX_MEM_EVENT(mem_ptrl_fsm_err_uc, 1s);
EX_MEM_EVENT(mem_errflw_fsm_fail_uc, 1s);
EX_MEM_EVENT(mem_mcpar_fsmerr_uc, 1s);
EX_MEM_EVENT(mem_vberr_uc, 1s);
EX_MEM_EVENT(mem_fberr_uc, 1s);

event fault.cpu.intel.quickpath.mem_controller_ue@chip/memory-controller,
    retire=0;

prop fault.cpu.intel.quickpath.mem_controller_ue@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_ptrl_fsm_err_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_errflw_fsm_fail_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_mcpar_fsmerr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_vberr_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_fberr_uc@chip/memory-controller;

/*
 * Scrubbing errors are diagnosed to the matching rank; the physical
 * address and/or offset from the ereport is recorded in the fault.
 */
EX_MEM_EVENT(mem_scrubbing_uc, 1s);

event fault.cpu.intel.quickpath.mem_scrubbing@CHIPDIMM/rank,
    response=0;

prop fault.cpu.intel.quickpath.mem_scrubbing@CHIPDIMM/rank[rank_num]
    { EX_RANK_MATCH &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.quickpath.mem_scrubbing_uc@chip/memory-controller;

/* ECC and even-parity errors, corrected and uncorrected. */
EX_MEM_EVENT(mem_ecc_uc, 12s);
EX_MEM_EVENT(mem_even_parity_uc, 1s);
EX_MEM_EVENT(mem_ecc, 12s);
EX_MEM_EVENT(mem_even_parity, 1s);

event error.memory.intel.ex_dimm_ce@CHIPDIMM/rank;

/* Page-level faults for the uncorrected ereports ... */
prop fault.memory.intel.page_ue@CHIPDIMM/rank[rank_num]
    { EX_RANK_MATCH &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.intel.quickpath.mem_ecc_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity_uc@chip/memory-controller;

/* ... and for the corrected ones. */
prop fault.memory.intel.page_ce@CHIPDIMM/rank[rank_num]
    { EX_RANK_MATCH &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.intel.quickpath.mem_ecc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity@chip/memory-controller;

/* DIMM-level UE goes through the intermediate dimm_ue_ex error event. */
prop error.memory.intel.dimm_ue_ex@CHIPDIMM/rank[rank_num]
    { EX_RANK_MATCH } (1)->
    ereport.cpu.intel.quickpath.mem_ecc_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity_uc@chip/memory-controller;

/*
 * DIMM-level CE when "dimm-size" is not defined: the engine is set to
 * N=10, T=1week, and the per-rank ex_dimm_ce error applies only once the
 * page fault count exceeds 512.
 */
prop fault.memory.intel.dimm_ce@CHIPDIMM
    { !confprop_defined(CHIPDIMM, "dimm-size") &&
    setserdn(10) & setserdt(1week) } (0)->
    error.memory.intel.ex_dimm_ce@CHIPDIMM/rank;

prop error.memory.intel.ex_dimm_ce@CHIPDIMM/rank[rank_num]
    { EX_RANK_MATCH &&
    !confprop_defined(CHIPDIMM, "dimm-size") &&
    count(STAT_CPU_MEM_CE_PGFLTS) > 512 } (1)->
    ereport.cpu.intel.quickpath.mem_ecc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity@chip/memory-controller;

/*
 * EX_CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate): the same pair of rules
 * for a DIMM whose "dimm-size" equals dimm_size, with engine parameters
 * N=n, T=t and a page fault count threshold of fault_rate.
 */
#define EX_CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate) \
    prop fault.memory.intel.dimm_ce@CHIPDIMM { \
        confprop(CHIPDIMM, "dimm-size") == dimm_size && \
        setserdn(n) & setserdt(t) } (0)-> \
        error.memory.intel.ex_dimm_ce@CHIPDIMM/rank; \
    prop error.memory.intel.ex_dimm_ce@CHIPDIMM/rank[rank_num] { \
        EX_RANK_MATCH && \
        confprop(CHIPDIMM, "dimm-size") == dimm_size && \
        count(STAT_CPU_MEM_CE_PGFLTS) > fault_rate } (1)-> \
        ereport.cpu.intel.quickpath.mem_ecc@chip/memory-controller, \
        ereport.cpu.intel.quickpath.mem_even_parity@chip/memory-controller;

EX_CPU_MEM_DIMM_CE("16G", 16, 1week, 2000)
EX_CPU_MEM_DIMM_CE("8G", 8, 1week, 2000)
EX_CPU_MEM_DIMM_CE("4G", 4, 1week, 1500)
EX_CPU_MEM_DIMM_CE("2G", 4, 2week, 1000)
EX_CPU_MEM_DIMM_CE("1G", 4, 4week, 500)

/* Upset that the scrubbing, ECC and parity ereports may also resolve to. */
event upset.memory.intel.discard@chip/memory-controller{within(1s)};

prop upset.memory.intel.discard@chip/memory-controller (0)->
    ereport.cpu.intel.quickpath.mem_scrubbing_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_ecc_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity_uc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_ecc@chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_even_parity@chip/memory-controller;

/* Mirror failover. */
EX_MEM_EVENT(mem_failover_mir, 1s);

event fault.cpu.intel.quickpath.mem_failover_mir@chip/memory-controller,
    retire=0;

prop fault.cpu.intel.quickpath.mem_failover_mir@chip/memory-controller ->
    ereport.cpu.intel.quickpath.mem_failover_mir@chip/memory-controller;

/*
 * RBox errors
 */

#define EX_EVENT(leafclass, t) \
    event ereport.cpu.intel.quickpath.leafclass@chip { within(t) }

engine serd.cpu.intel.quickpath.bus_interconnect@chip,
    N=3, T=72h;
event fault.cpu.intel.quickpath.bus_interconnect@chip,
    engine=serd.cpu.intel.quickpath.bus_interconnect@chip,
    retire=0;

/* Retry aborts and corrected link-init errors resolve to the discard upset. */
EX_EVENT(bus_retry_abort, 1s);
EX_EVENT(bus_link_init_ce, 1s);

event upset.cpu.intel.quickpath.discard@chip;

prop upset.cpu.intel.quickpath.discard@chip (0)->
    ereport.cpu.intel.quickpath.bus_retry_abort@chip,
    ereport.cpu.intel.quickpath.bus_link_init_ce@chip;

/* Corrected bus errors feed the bus_interconnect engine (default increment). */
EX_EVENT(bus_unknown, 1s);
EX_EVENT(bus_single_ecc, 1s);
EX_EVENT(bus_crc_flit, 1s);

prop fault.cpu.intel.quickpath.bus_interconnect@chip (0)->
    ereport.cpu.intel.quickpath.bus_unknown@chip,
    ereport.cpu.intel.quickpath.bus_single_ecc@chip,
    ereport.cpu.intel.quickpath.bus_crc_flit@chip;

/* The "_external" variants of the corrected errors go to the discard upset. */
EX_EVENT(bus_unknown_external, 1s);
EX_EVENT(bus_crc_flit_external, 1s);

prop upset.cpu.intel.quickpath.discard@chip (0)->
    ereport.cpu.intel.quickpath.bus_unknown_external@chip,
    ereport.cpu.intel.quickpath.bus_crc_flit_external@chip;

/* Uncorrected bus errors: increment of 4 against the engine's N=3. */
EX_EVENT(bus_unknown_uc, 1s);
EX_EVENT(bus_opr_poison_err, 1s);
EX_EVENT(bus_eot_parity, 1s);
EX_EVENT(bus_rta_parity, 1s);
EX_EVENT(bus_bad_sbu_route, 1s);
EX_EVENT(bus_bad_msg, 1s);
EX_EVENT(bus_bad_vn_credit, 1s);
EX_EVENT(bus_hdr_double_ecc, 1s);
EX_EVENT(bus_link_retry_err, 1s);

prop fault.cpu.intel.quickpath.bus_interconnect@chip
    { setserdincrement(4) } (0)->
    ereport.cpu.intel.quickpath.bus_unknown_uc@chip,
    ereport.cpu.intel.quickpath.bus_opr_poison_err@chip,
    ereport.cpu.intel.quickpath.bus_eot_parity@chip,
    ereport.cpu.intel.quickpath.bus_rta_parity@chip,
    ereport.cpu.intel.quickpath.bus_bad_sbu_route@chip,
    ereport.cpu.intel.quickpath.bus_bad_msg@chip,
    ereport.cpu.intel.quickpath.bus_bad_vn_credit@chip,
    ereport.cpu.intel.quickpath.bus_hdr_double_ecc@chip,
    ereport.cpu.intel.quickpath.bus_link_retry_err@chip;

/* The "_external" variants of the uncorrected errors go to the discard upset. */
EX_EVENT(bus_unknown_uc_external, 1s);
EX_EVENT(bus_opr_poison_err_external, 1s);
EX_EVENT(bus_eot_parity_external, 1s);
EX_EVENT(bus_rta_parity_external, 1s);
EX_EVENT(bus_bad_sbu_route_external, 1s);
EX_EVENT(bus_bad_msg_external, 1s);
EX_EVENT(bus_bad_vn_credit_external, 1s);
EX_EVENT(bus_hdr_double_ecc_external, 1s);
EX_EVENT(bus_link_retry_err_external, 1s);

prop upset.cpu.intel.quickpath.discard@chip (0)->
    ereport.cpu.intel.quickpath.bus_unknown_uc_external@chip,
    ereport.cpu.intel.quickpath.bus_opr_poison_err_external@chip,
    ereport.cpu.intel.quickpath.bus_eot_parity_external@chip,
    ereport.cpu.intel.quickpath.bus_rta_parity_external@chip,
    ereport.cpu.intel.quickpath.bus_bad_sbu_route_external@chip,
    ereport.cpu.intel.quickpath.bus_bad_msg_external@chip,
    ereport.cpu.intel.quickpath.bus_bad_vn_credit_external@chip,
    ereport.cpu.intel.quickpath.bus_hdr_double_ecc_external@chip,
    ereport.cpu.intel.quickpath.bus_link_retry_err_external@chip;

/*
 * CBox errors
 */

EX_EVENT(llc_ewb_uc, 1s);

event fault.cpu.intel.quickpath.llc_ewb@chip,
    retire=0, response=0;

/* Fault only when an address or offset is present to record in it ... */
prop fault.cpu.intel.quickpath.llc_ewb@chip
    { (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.quickpath.llc_ewb_uc@chip;

/* ... the discard upset is the unconstrained alternative. */
prop upset.cpu.intel.quickpath.discard@chip (0)->
    ereport.cpu.intel.quickpath.llc_ewb_uc@chip;

/*
 * SBox errors
 */

EX_EVENT(system_cache_uc, 1s);

event fault.cpu.intel.quickpath.system_cache@chip,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.system_cache@chip ->
    ereport.cpu.intel.quickpath.system_cache_uc@chip;

/*
 * BBox errors
 */

EX_EVENT(home_agent_uc, 1s);

event fault.cpu.intel.quickpath.home_agent@chip,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.home_agent@chip ->
    ereport.cpu.intel.quickpath.home_agent_uc@chip;

/*
 * UBox errors
 */

EX_EVENT(sys_cfg_cfa_ecc, 1s);
EX_EVENT(sys_cfg_uc, 1s);

engine serd.cpu.intel.quickpath.sys_cfg@chip,
    N=2, T=72h;
event fault.cpu.intel.quickpath.sys_cfg@chip,
    engine=serd.cpu.intel.quickpath.sys_cfg@chip,
    retire=0, response=0;

prop fault.cpu.intel.quickpath.sys_cfg@chip (0)->
    ereport.cpu.intel.quickpath.sys_cfg_cfa_ecc@chip;

/* Uncorrected: increment of 3 against the engine's N=2. */
prop fault.cpu.intel.quickpath.sys_cfg@chip
    { setserdincrement(3) } (0)->
    ereport.cpu.intel.quickpath.sys_cfg_uc@chip;

/*
 * Handling poison errors
 *
 * Two silent faults (message=0, retire=0, response=0) are used purely as
 * counters: has_poison and discard_fatal, each tied to a stat engine on the
 * motherboard.
 */

engine stat.has_poison@motherboard;
event fault.cpu.intel.has_poison@motherboard,
    count=stat.has_poison@motherboard[0],
    message=0, retire=0, response=0;

engine stat.discard_fatal@motherboard;
event fault.cpu.intel.discard_fatal@motherboard,
    count=stat.discard_fatal@motherboard[0],
    message=0, retire=0, response=0;

/* has_poison applies to any of these ereports whose "poison" payload is 1. */
prop fault.cpu.intel.has_poison@motherboard
    { payloadprop_defined("poison") && 1 == payloadprop("poison") } (1)->
    ereport.cpu.intel.quickpath.mem_scrubbing_uc@chip<>/memory-controller<>,
    ereport.cpu.intel.quickpath.llc_ewb_uc@chip<>,
    ereport.cpu.intel.quickpath.system_cache_uc@chip<>,
    ereport.cpu.intel.quickpath.bus_opr_poison_err@chip<>,
    ereport.cpu.intel.quickpath.bus_opr_poison_err_external@chip<>;

/*
 * discard_fatal applies to an internal_unclassified ereport only while the
 * has_poison count exceeds the discard_fatal count and the ereport has
 * bank_number 5 with processor_context_corrupt set to 1.
 */
prop fault.cpu.intel.discard_fatal@motherboard
    { count(stat.has_poison@motherboard[0]) >
    count(stat.discard_fatal@motherboard[0]) &&
    payloadprop_defined("bank_number") &&
    5 == payloadprop("bank_number") &&
    payloadprop_defined("processor_context_corrupt") &&
    1 == payloadprop("processor_context_corrupt") } (0)->
    ereport.cpu.intel.internal_unclassified@chip<>/core<>/strand<>
    {within(10s)};

/*
 * In every other case the internal_unclassified ereport is handled like the
 * other simple errors: fault.cpu.intel.internal, with an increment of 4 when
 * error_uncorrected is 1.
 */
prop fault.cpu.intel.internal@chip/core/strand
    { (count(stat.has_poison@motherboard[0]) <=
    count(stat.discard_fatal@motherboard[0]) ||
    !payloadprop_defined("bank_number") ||
    5 != payloadprop("bank_number") ||
    !payloadprop_defined("processor_context_corrupt") ||
    0 == payloadprop("processor_context_corrupt")) &&
    (payloadprop("error_uncorrected") == 1 ? setserdincrement(4) : 1) } (0)->
    ereport.cpu.intel.internal_unclassified@chip/core/strand;