Skip to content

Commit

Permalink
Fix subtle issue in TPNOTACID_CHECK macro (caused insert_region.c ass…
Browse files Browse the repository at this point in the history
…ert failure)

Background
----------
* Below is a first-time failure, when running the `r126/ydb464` subtest (from the YDBTest project), that
  I noticed while trying to reproduce some other failure.

  ```diff
  --- ydb464/ydb464.diff ---
  19a20,73
  > r126_0_31/ydb464/simpleapi2/child98118.log
  > %YDB-F-ASSERT, Assert failed in sr_port/insert_region.c line 110 for expression ((CDB_STAGNATE > t_tries) || (dollar_tlevel && csa->now_crit))
  ```

* The C-stack and relevant variables from the core file are pasted below.

  ```c
  (gdb) where
  #0  pthread_kill () from /usr/lib64/libpthread.so.0
  #1  gtm_dump_core () at sr_unix/gtm_dump_core.c:74
  #2  gtm_fork_n_core () at sr_unix/gtm_fork_n_core.c:163
  #3  ch_cond_core () at sr_unix/ch_cond_core.c:80
  #4  rts_error_va (csa=0x0, argcnt=7, var=0x7ffee07f7480) at sr_unix/rts_error.c:198
  #5  rts_error_csa (csa=0x0, argcnt=7) at sr_unix/rts_error.c:99
  #6  insert_region (reg=0x14d0170, reg_list=0x7ff49179f158 <tp_reg_list>, reg_free_list=0x7ff49179f078 <tp_reg_free_list>, size=40) at sr_port/insert_region.c:110
  #7  mlk_unlock (p=0x1591940) at sr_port/mlk_unlock.c:70
  #8  tp_unwind (newlevel=0, invocation_type=ROLLBACK_INVOCATION, tprestart_rc=0x0) at sr_port/tp_unwind.c:294
  #9  op_trollback (rb_levels=0) at sr_port/op_trollback.c:200
  #10 secshr_db_clnup (secshr_state=NORMAL_TERMINATION) at sr_port/secshr_db_clnup.c:569
  #11 gtm_exit_handler () at sr_unix/gtm_exit_handler.c:230
  #12 signal_exit_handler (exit_handler_name=0x7ff4913b071e "deferred_exit_handler", sig=2, info=0x7ff491795458 <stapi_signal_handler_oscontext+3224>, context=0x7ff4917954d8 <stapi_signal_handler_oscontext+3352>, is_deferred_exit=1) at sr_unix/signal_exit_handler.c:78
  #13 deferred_exit_handler () at sr_unix/deferred_exit_handler.c:120
  #14 deferred_signal_handler () at sr_port/deferred_signal_handler.c:74
  #15 rel_crit (reg=0x14d0170) at sr_unix/rel_crit.c:81
  #16 mlk_lock (p=0x1591940, auxown=0, new=1) at sr_port/mlk_lock.c:120
  #17 op_lock2_common (timeout=0, laflag=64 '@') at sr_port/op_lock2.c:242
  #18 op_incrlock_common (timeout=0) at sr_port/op_incrlock.c:49
  #19 ydb_lock_incr_s (timeout_nsec=0, varname=0x7ffee07f8c30, subs_used=0, subsarray=0x0) at sr_unix/ydb_lock_incr_s.c:91
  #20 runProc (settings=0x7ffee07fab80, curDepth=1) at simpleapi/inref/randomWalk.c:489
  #21 tpHelper (tpfnparm=0x7ffee07fa100) at simpleapi/inref/randomWalk.c:691
  #22 ydb_tp_s_common (lydbrtn=LYDB_RTN_TP, tpfn=0x4037c2 <tpHelper>, tpfnparm=0x7ffee07fa100, transid=0x4041f9 "BATCH", namecount=0, varnames=0x0) at sr_unix/ydb_tp_s_common.c:256
  #23 ydb_tp_s (tpfn=0x4037c2 <tpHelper>, tpfnparm=0x7ffee07fa100, transid=0x4041f9 "BATCH", namecount=0, varnames=0x0) at sr_unix/ydb_tp_s.c:38
  #24 runProc (settings=0x7ffee07fab80, curDepth=0) at simpleapi/inref/randomWalk.c:666
  #25 runProc_driver (settings=0x7ffee07fab80) at simpleapi/inref/randomWalk.c:145
  #26 main () at simpleapi/inref/randomWalk.c:93

  (gdb) f 6
  #6  insert_region (reg=0x14d0170, reg_list=0x7ff49179f158 <tp_reg_list>, reg_free_list=0x7ff49179f078 <tp_reg_free_list>, size=40) at sr_port/insert_region.c:110
  110                                     assert((CDB_STAGNATE > t_tries) || (dollar_tlevel && csa->now_crit));

  (gdb) p process_exiting
  $3 = 1

  (gdb) p t_tries
  $4 = 3

  (gdb) p dollar_tlevel
  $5 = 1

  (gdb) p csa->now_crit
  $6 = 0

  (gdb) up
  #16 mlk_lock (p=0x1591940, auxown=0, new=1) at sr_port/mlk_lock.c:120
  120                             TPNOTACID_CHECK(LOCKGCINTP);
  ```

Issue
-----
* The assert that failed in `insert_region()` (frame 6 in above stack trace) indicates that we were in the
  final retry (i.e. `t_tries` is equal to `3` or `CDB_STAGNATE`) but we did not hold crit on the current
  region where we are trying to do an `mlk_unlock()` operation.

* The assert is valid and did expose an issue.

* In frame 16, in `mlk_lock()`, we did a `rel_crit()` call in the `TPNOTACID_CHECK` macro while in the
  final retry.

  **sr_port/mlk_lock.c**
  ```c
    120                         TPNOTACID_CHECK(LOCKGCINTP);
  ```

* Below is the code inside the macro.

  **sr_port/tp.h**
  ```c
     979 #define TPNOTACID_CHECK(CALLER_STR)                                                                                             \
     980 {                                                                                                                               \
     981         GBLREF  boolean_t       mupip_jnl_recover;                                                                              \
     982         mval            zpos;                                                                                                   \
     983                                                                                                                                 \
     984         if (IS_TP_AND_FINAL_RETRY)                                                                                              \
     985         {                                                                                                                       \
  -> 986                 TP_REL_CRIT_ALL_REG;                                                                                            \
     987                 assert(!mupip_jnl_recover);                                                                                     \
     988                 TP_FINAL_RETRY_DECREMENT_T_TRIES_IF_OK;                                                                         \
  ```

* Line 986 is where the issue is. We do a `rel_crit()` call there but `t_tries` is still not decremented.
  The decrement of `t_tries` happens 2 lines later at line 988.

* Before doing the `rel_crit()` call, we need to decrement `t_tries`. This way, in case `rel_crit()`
  decides to invoke exit handling due to handling a deferred SIGINT signal (sent in the `ydb464` subtest),
  the assert in `insert_region()` would not be confused by seeing this out-of-design state and will not
  attempt to invoke `t_retry()` etc. which is a no-no as we should not transfer control to M code as
  part of a TP restart while the process is about to terminate on receipt of a SIGINT signal.

Fix
---
* Notice that in `sr_port/t_commit_cleanup.c`, the `t_tries` decrement happens BEFORE the `rel_crit()`
  call.

  **sr_port/t_commit_cleanup.c**
  ```c
    288       if (CDB_STAGNATE <= t_tries)
    289               TP_FINAL_RETRY_DECREMENT_T_TRIES_IF_OK; /* t_tries untouched for rollback and recover */
      .
      .
    303               if (!csa->hold_onto_crit && csa->now_crit)
    304                       rel_crit(tr->reg);      /* Undo Step (CMT01) */
  ```

* In a similar fashion, in the `TPNOTACID_CHECK` macro in `sr_port/tp.h`, the `TP_REL_CRIT_ALL_REG` call
  should happen AFTER the `TP_FINAL_RETRY_DECREMENT_T_TRIES_IF_OK` call. And that is the fix.

* While doing this fix, I noticed a similar ordering issue in `sr_port/gvcst_init.c` and so fixed that too.

Notes
-----
* While this failure happened with a Debug build of YottaDB, I suspect there is an issue in the Release
  build of YottaDB too. But not sure exactly what the user-visible implications are. Even if so, it is
  likely to be not encountered in practice and so no user-visible issue is created for this.
  • Loading branch information
nars1 committed Nov 15, 2023
1 parent d366c66 commit 156c110
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 5 deletions.
8 changes: 5 additions & 3 deletions sr_port/gvcst_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2001-2021 Fidelity National Information *
* Services, Inc. and/or its subsidiaries. All rights reserved. *
* *
* Copyright (c) 2017-2022 YottaDB LLC and/or its subsidiaries. *
* Copyright (c) 2017-2023 YottaDB LLC and/or its subsidiaries. *
* All rights reserved. *
* *
* This source code contains the intellectual property *
Expand Down Expand Up @@ -442,10 +442,12 @@ void gvcst_init(gd_region *reg)
* as it would result in we later starting the final retry with t_tries = 2 but
* holding crit on all regions which is an out-of-design situation.
*/
assert(!mupip_jnl_recover);
TP_FINAL_RETRY_DECREMENT_T_TRIES_IF_OK; /* Note this needs to be called BEFORE TP_REL_CRIT_ALL_REG.
* See comment in TPNOTACID_CHECK macro for why.
*/
TP_REL_CRIT_ALL_REG;
assert(!baseDBcsa->now_crit);
assert(!mupip_jnl_recover);
TP_FINAL_RETRY_DECREMENT_T_TRIES_IF_OK;
}
db_init_region = baseDBreg; /* gvcst_init_autoDB_ch needs reg 4 ftok_sem_lock */
DEFER_INTERRUPTS(INTRPT_IN_GVCST_INIT, prev_intrpt_state);
Expand Down
9 changes: 7 additions & 2 deletions sr_port/tp.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2001-2021 Fidelity National Information *
* Services, Inc. and/or its subsidiaries. All rights reserved. *
* *
* Copyright (c) 2017-2021 YottaDB LLC and/or its subsidiaries. *
* Copyright (c) 2017-2023 YottaDB LLC and/or its subsidiaries. *
* All rights reserved. *
* *
* This source code contains the intellectual property *
Expand Down Expand Up @@ -983,9 +983,14 @@ GBLREF unsigned int t_tries;
\
if (IS_TP_AND_FINAL_RETRY) \
{ \
TP_REL_CRIT_ALL_REG; \
assert(!mupip_jnl_recover); \
/* Note: The TP_REL_CRIT_ALL_REG call needs to happen AFTER the TP_FINAL_RETRY_DECREMENT_T_TRIES_IF_OK call \
* as otherwise it is possible a SIGTERM/SIGINT/SIGQUIT signal gets handled as part of the TP_REL_CRIT_ALL_REG \
* call taking us to exit processing while "t_tries" is still set to "CDB_STAGNATE" which would confuse other \
* code as we are effectively in the final retry but not holding crit on a region inside a TP transaction. \
*/ \
TP_FINAL_RETRY_DECREMENT_T_TRIES_IF_OK; \
TP_REL_CRIT_ALL_REG; \
getzposition(&zpos); \
send_msg_csa(CSA_ARG(NULL) VARLSTCNT(9) ERR_TPNOTACID, 7, LEN_AND_LIT(CALLER_STR), zpos.str.len, zpos.str.addr, \
(TREF(tpnotacidtime)).str.len, (TREF(tpnotacidtime)).str.addr, dollar_trestart); \
Expand Down

0 comments on commit 156c110

Please sign in to comment.