Skip to content

Commit

Permalink
Do not gard cores on the initial core wakeup failure
Browse files Browse the repository at this point in the history
We have seen rare (but non-zero) errors during slave core wakeup
where we never see the new core reporting in.  Currently this
will result in a visible log and a core gard.  However, there is
currently no indication this failure is actually due to bad
hardware.

As a workaround, this commit adds an indicator that keeps track
of whether a core has previously failed wakeup.  The first time we
encounter the error there will be a visible log with a FW callout
and no deconfig or gard of the core.  That will trigger a boot
failure and a reboot.  If we don't fail on the next boot (which
is expected), the counter will be cleared.  If we do fail again
there will be a visible log (with a new SRC) that calls out the
core as the primary cause, plus does a deconfig+gard.

Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70993
Tested-by: Jenkins Server <[email protected]>
Tested-by: Jenkins OP Build CI <[email protected]>
Tested-by: Jenkins OP HW <[email protected]>
Reviewed-by: Daniel M. Crowell <[email protected]>
  • Loading branch information
dcrowell77 committed Jan 29, 2019
1 parent 10dbf93 commit 753249a
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 8 deletions.
16 changes: 15 additions & 1 deletion src/include/usr/isteps/istep_reasoncodes.H
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,21 @@ namespace ISTEP
RC_FAILED_TO_BOOT_SBE = ISTEP_COMP_ID | 0x38,
RC_REDISCOVERED_TARGETS = ISTEP_COMP_ID | 0x39,
RC_P9N_LESS_THAN_DD22_NOT_SUPPORTED = ISTEP_COMP_ID | 0x3A,
RC_PNOR_IPMI_NOT_ENABLED = ISTEP_COMP_ID | 0x3B,
RC_FREQ_ATTR_TIMER_EXPIRED = ISTEP_COMP_ID | 0x40,
RC_FREQ_ATTR_TIMER_THREAD_FAIL = ISTEP_COMP_ID | 0x41,
RC_FLOOR_FREQ_MISMATCH = ISTEP_COMP_ID | 0x42,
RC_CEIL_FREQ_MISMATCH = ISTEP_COMP_ID | 0x43,
RC_TURBO_FREQ_MISMATCH = ISTEP_COMP_ID | 0x44,
RC_ULTRA_TURBO_FREQ_MISMATCH = ISTEP_COMP_ID | 0x45,
RC_NEST_FREQ_MISMATCH = ISTEP_COMP_ID | 0x46,
RC_NO_VALID_MEM_CONFIG = ISTEP_COMP_ID | 0x47,
RC_MASTER_GET_SBE_BOOT_SEEPROM_FAIL = ISTEP_COMP_ID | 0x48,
RC_SLAVE_GET_SBE_BOOT_SEEPROM_FAIL = ISTEP_COMP_ID | 0x49,
RC_LINK_TRAIN_ERRORS_FROM_HWP = ISTEP_COMP_ID | 0x4A,
RC_RISK_LEVEL_TOO_LOW = ISTEP_COMP_ID | 0x4B,
RC_INVALID_HX_KEYWORD_DATA = ISTEP_COMP_ID | 0x4C,
RC_PNOR_IPMI_NOT_ENABLED = ISTEP_COMP_ID | 0x4D,
RC_SLAVE_CORE_WAKEUP_ERROR = ISTEP_COMP_ID | 0x4E,
};

};
Expand Down
97 changes: 92 additions & 5 deletions src/usr/isteps/istep16/call_host_activate_slave_cores.C
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,13 @@ void* call_host_activate_slave_cores (void *io_pArgs)
}
} // End of handle time out error

// Create error log
if (0 != rc)
// Check if this core failed last time
ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail =
(*l_core)->getAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>();

// Create predictive error log if this is the first failure
// AND the HWP didn't see a problem
if( (0 != rc) && (l_prevFail == 0) && (l_checkidle_eid == 0) )
{
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"call_host_activate_slave_cores: "
Expand Down Expand Up @@ -208,11 +213,75 @@ void* call_host_activate_slave_cores (void *io_pArgs)
l_checkidle_eid,
rc) );

// Going to assume some kind of SW error unless it fails
// again
l_errl->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE,
HWAS::SRCI_PRIORITY_HIGH);

// Callout core that failed to wake up.
l_errl->addHwCallout(*l_core,
HWAS::SRCI_PRIORITY_MED,
HWAS::DECONFIG,
HWAS::GARD_Predictive);
HWAS::SRCI_PRIORITY_LOW,
HWAS::NO_DECONFIG,
HWAS::GARD_NULL);

// Could be an interrupt issue
l_errl->collectTrace(INTR_TRACE_NAME,256);

// Throw printk in there too in case it is a kernel issue
ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl);

// Add interesting ISTEP traces
l_errl->collectTrace(ISTEP_COMP_NAME,256);

l_stepError.addErrorDetails( l_errl );
errlCommit( l_errl, HWPF_COMP_ID );

// Remember that we failed so we can gard the core if it
// happens again on the reboot
l_prevFail = 1;
(*l_core)->
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);

break;
}
// Create unrecoverable error log if this is a repeat
// OR if the HWP hit something
else if( (0 != rc) &&
((l_prevFail > 0) || (l_checkidle_eid != 0)) )
{
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"call_host_activate_slave_cores: "
"Core errors during wakeup on core %x",
pir);
/*@
* @errortype
* @reasoncode RC_SLAVE_CORE_WAKEUP_ERROR
* @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE
* @moduleid MOD_HOST_ACTIVATE_SLAVE_CORES
* @userdata1[00:31] PIR of failing core.
* @userdata1[32:63] Number of previous failures.
* @userdata2[00:31] EID from p9_check_idle_stop_done().
* @userdata2[32:63] rc of cpu_start_core().
*
* @devdesc Kernel returned error when trying to activate
* core.
*/
l_errl = new ERRORLOG::ErrlEntry(
ERRORLOG::ERRL_SEV_UNRECOVERABLE,
MOD_HOST_ACTIVATE_SLAVE_CORES,
RC_SLAVE_CORE_WAKEUP_ERROR,
TWO_UINT32_TO_UINT64(
pir,
l_prevFail),
TWO_UINT32_TO_UINT64(
l_checkidle_eid,
rc) );

// Callout and gard core that failed to wake up.
l_errl->addHwCallout(*l_core,
HWAS::SRCI_PRIORITY_HIGH,
HWAS::DECONFIG,
HWAS::GARD_Predictive);

// Could be an interrupt issue
l_errl->collectTrace(INTR_TRACE_NAME,256);
Expand All @@ -225,8 +294,26 @@ void* call_host_activate_slave_cores (void *io_pArgs)

l_stepError.addErrorDetails( l_errl );
errlCommit( l_errl, HWPF_COMP_ID );

// We garded the core so we should zero out the fail
// counter so the replacement doesn't get blamed
l_prevFail = 0;
(*l_core)->
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);

break;
}
// Zero out the counter if we passed
else if( l_prevFail > 0 )
{
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"call_host_activate_slave_cores: "
"Resetting failure count for core %.8X",
TARGETING::get_huid(*l_core) );
l_prevFail = 0;
(*l_core)->
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
}
}
}
// @@@@@ END CUSTOM BLOCK: @@@@@
Expand Down
20 changes: 19 additions & 1 deletion src/usr/targeting/common/xmltohb/attribute_types.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<!-- -->
<!-- OpenPOWER HostBoot Project -->
<!-- -->
<!-- Contributors Listed Below - COPYRIGHT 2012,2018 -->
<!-- Contributors Listed Below - COPYRIGHT 2012,2019 -->
<!-- [+] International Business Machines Corp. -->
<!-- -->
<!-- -->
Expand Down Expand Up @@ -6755,6 +6755,24 @@ Selects which voltage level to place the Core and ECO domain PFETs upon Winkle e
<writeable/>
</attribute>

<attribute>
<!-- Per-core flag used by call_host_activate_slave_cores: it is set to 1
     after a first wakeup failure (logged without deconfig or gard), and
     written back to 0 either after a repeat failure has deconfigured and
     garded the core or when the core wakes up successfully.  Declared
     non-volatile below; presumably that is what lets the value survive
     the reboot that follows the first failure (confirm against the
     targeting persistency rules). -->
<description>
Tracks if a specific core has previously experienced a timeout during
initial activation.
0 = No previous errors reported;
1 = Core failed on the last attempt to be started
</description>
<id>PREVIOUS_WAKEUP_FAIL</id>
<persistency>non-volatile</persistency>
<readable/>
<writeable/>
<simpleType>
<uint8_t>
<default>0</default>
</uint8_t>
</simpleType>
<no_export/>
</attribute>

<attribute>
<id>SLOT_NAME</id>
Expand Down
5 changes: 4 additions & 1 deletion src/usr/targeting/common/xmltohb/target_types.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<!-- -->
<!-- OpenPOWER HostBoot Project -->
<!-- -->
<!-- Contributors Listed Below - COPYRIGHT 2012,2018 -->
<!-- Contributors Listed Below - COPYRIGHT 2012,2019 -->
<!-- [+] Google Inc. -->
<!-- [+] International Business Machines Corp. -->
<!-- -->
Expand Down Expand Up @@ -1316,6 +1316,9 @@
<default>CPU</default>
<id>CDM_DOMAIN</id>
</attribute>
<attribute>
<id>PREVIOUS_WAKEUP_FAIL</id>
</attribute>
</targetType>

<!-- MCS
Expand Down

0 comments on commit 753249a

Please sign in to comment.