Skip to content

Commit

Permalink
Removing generic linux calls and mmap cleanup
Browse files Browse the repository at this point in the history
We were using the generic Linux calls to make sure that the page cache
was cleaned out before issuing any Direct I/O reads or writes. However,
this only matters in the event the file region being written/read from
using O_DIRECT was mmap'ed. One of the stipulations with O_DIRECT is that it
is redirected through the ARC in the event the file range is mmap'ed.
Because of this, it did not make sense to try and invalidate the page
cache if we were never intending to have O_DIRECT work with mmap'ed
regions. Also, calls into the generic Linux calls in writes would often
lead to lockups as the page lock is dropped in zfs_putpage(). See the
stack dump below. In order to just prevent this, we no longer will use
the generic linux direct IO wrappers or try and flush out the page
cache.

Instead if we find the file range has been mmap'ed in since the initial
check in zfs_setup_direct() we will just now directly handle that in
zfs_read() and zfs_write(). In most cases zfs_setup_direct() will prevent
O_DIRECT to mmap'ed regions of the file that have been page faulted in,
but if that happens while we are issuing the direct I/O request, the
normal parts of the ZFS paths will be taken to account for this.

It is highly suggested not to mmap a region of a file and then write or
read directly to the file. In general, that is kind of an insane thing to
do... However, we try our best to still have consistency with the ARC.

Also, before making this decision I did explore if we could just add a
rangelock in zfs_fillpage(), but we can not do that. The reason is when
the page is in zfs_readpage_common() it has already been locked by the
kernel. So, if we try and grab the rangelock anywhere in that path we
can get stuck if another thread is issuing writes to the file region
that was mmap'ed in. The reason is update_pages() holds the rangelock
and then tries to lock the page. In this case zfs_fillpage() holds the
page lock but is stuck in the rangelock waiting and holding the page
lock. Deadlock is unavoidable in this case.

[260136.244332] INFO: task fio:3791107 blocked for more than 120
seconds.
[260136.250867]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260136.258693] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260136.266607] task:fio             state:D stack:    0 pid:3791107
ppid:3790841 flags:0x00004080
[260136.275306] Call Trace:
[260136.277845]  __schedule+0x2d1/0x830
[260136.281432]  schedule+0x35/0xa0
[260136.284665]  io_schedule+0x12/0x40
[260136.288157]  wait_on_page_bit+0x123/0x220
[260136.292258]  ? xas_load+0x8/0x80
[260136.295577]  ? file_fdatawait_range+0x20/0x20
[260136.300024]  filemap_page_mkwrite+0x9b/0xb0
[260136.304295]  do_page_mkwrite+0x53/0x90
[260136.308135]  ? vm_normal_page+0x1a/0xc0
[260136.312062]  do_wp_page+0x298/0x350
[260136.315640]  __handle_mm_fault+0x44f/0x6c0
[260136.319826]  ? __switch_to_asm+0x41/0x70
[260136.323839]  handle_mm_fault+0xc1/0x1e0
[260136.327766]  do_user_addr_fault+0x1b5/0x440
[260136.332038]  do_page_fault+0x37/0x130
[260136.335792]  ? page_fault+0x8/0x30
[260136.339284]  page_fault+0x1e/0x30
[260136.342689] RIP: 0033:0x7f6deee7f1b4
[260136.346361] Code: Unable to access opcode bytes at RIP
0x7f6deee7f18a.
[260136.352977] RSP: 002b:00007fffe41b6538 EFLAGS: 00010202
[260136.358288] RAX: 00007f6d83049000 RBX: 0000556b63614ec0 RCX:
00007f6d83148fe0
[260136.365508] RDX: 00000000000acfe0 RSI: 00007f6d84e9c030 RDI:
00007f6d8309bfa0
[260136.372730] RBP: 00007f6d84f4a000 R08: ffffffffffffffe0 R09:
0000000000000000
[260136.379946] R10: 00007f6d84f8e810 R11: 00007f6d83049000 R12:
0000000000000001
[260136.387167] R13: 0000556b63614ec0 R14: 0000000000100000 R15:
0000556b63614ee8
[260136.394387] INFO: task fio:3791108 blocked for more than 120
seconds.
[260136.400911]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260136.408739] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260136.416651] task:fio             state:D stack:    0 pid:3791108
ppid:3790835 flags:0x00004080
[260136.425343] Call Trace:
[260136.427883]  __schedule+0x2d1/0x830
[260136.431463]  ? cv_wait_common+0x12d/0x240 [spl]
[260136.436091]  schedule+0x35/0xa0
[260136.439321]  io_schedule+0x12/0x40
[260136.442814]  __lock_page+0x12d/0x230
[260136.446483]  ? file_fdatawait_range+0x20/0x20
[260136.450929]  zfs_putpage+0x148/0x590 [zfs]
[260136.455322]  ? rmap_walk_file+0x116/0x290
[260136.459421]  ? __mod_memcg_lruvec_state+0x5d/0x160
[260136.464300]  zpl_putpage+0x67/0xd0 [zfs]
[260136.468495]  write_cache_pages+0x197/0x420
[260136.472679]  ? zpl_readpage_filler+0x10/0x10 [zfs]
[260136.477732]  zpl_writepages+0x119/0x130 [zfs]
[260136.482352]  do_writepages+0xc2/0x1c0
[260136.486103]  ? flush_tlb_func_common.constprop.9+0x125/0x220
[260136.491850]  __filemap_fdatawrite_range+0xc7/0x100
[260136.496732]  filemap_write_and_wait_range+0x30/0x80
[260136.501695]  generic_file_direct_write+0x120/0x160
[260136.506575]  ? rrw_exit+0xb0/0x1c0 [zfs]
[260136.510779]  zpl_iter_write+0xdd/0x160 [zfs]
[260136.515323]  new_sync_write+0x112/0x160
[260136.519255]  vfs_write+0xa5/0x1a0
[260136.522662]  ksys_write+0x4f/0xb0
[260136.526067]  do_syscall_64+0x5b/0x1a0
[260136.529818]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[260136.534959] RIP: 0033:0x7f9d192c7a17
[260136.538625] Code: Unable to access opcode bytes at RIP
0x7f9d192c79ed.
[260136.545236] RSP: 002b:00007ffc8e4ba270 EFLAGS: 00000293 ORIG_RAX:
0000000000000001
[260136.552889] RAX: ffffffffffffffda RBX: 0000000000000005 RCX:
00007f9d192c7a17
[260136.560108] RDX: 0000000000100000 RSI: 00007f9caea03000 RDI:
0000000000000005
[260136.567329] RBP: 00007f9caea03000 R08: 0000000000000000 R09:
0000000000000000
[260136.574548] R10: 00005558e8975680 R11: 0000000000000293 R12:
0000000000100000
[260136.581767] R13: 00005558e8985ec0 R14: 0000000000100000 R15:
00005558e8985ee8
[260136.588989] INFO: task fio:3791109 blocked for more than 120
seconds.
[260136.595513]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260136.603337] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260136.611250] task:fio             state:D stack:    0 pid:3791109
ppid:3790838 flags:0x00004080
[260136.619943] Call Trace:
[260136.622483]  __schedule+0x2d1/0x830
[260136.626064]  ? zfs_znode_held+0xe6/0x140 [zfs]
[260136.630777]  schedule+0x35/0xa0
[260136.634009]  cv_wait_common+0x153/0x240 [spl]
[260136.638466]  ? finish_wait+0x80/0x80
[260136.642129]  zfs_rangelock_enter_reader+0xa1/0x1f0 [zfs]
[260136.647712]  zfs_rangelock_enter_impl+0xbf/0x170 [zfs]
[260136.653121]  zfs_get_data+0x113/0x770 [zfs]
[260136.657567]  zil_lwb_commit+0x537/0x780 [zfs]
[260136.662187]  zil_process_commit_list+0x14c/0x460 [zfs]
[260136.667585]  zil_commit_writer+0xeb/0x160 [zfs]
[260136.672376]  zil_commit_impl+0x5d/0xa0 [zfs]
[260136.676910]  zfs_putpage+0x516/0x590 [zfs]
[260136.681279]  zpl_putpage+0x67/0xd0 [zfs]
[260136.685467]  write_cache_pages+0x197/0x420
[260136.689649]  ? zpl_readpage_filler+0x10/0x10 [zfs]
[260136.694705]  zpl_writepages+0x119/0x130 [zfs]
[260136.699322]  do_writepages+0xc2/0x1c0
[260136.703076]  __filemap_fdatawrite_range+0xc7/0x100
[260136.707952]  filemap_write_and_wait_range+0x30/0x80
[260136.712920]  zpl_iter_read_direct+0x86/0x1b0 [zfs]
[260136.717972]  ? rrw_exit+0xb0/0x1c0 [zfs]
[260136.722174]  zpl_iter_read+0x90/0xb0 [zfs]
[260136.726536]  new_sync_read+0x10f/0x150
[260136.730376]  vfs_read+0x91/0x140
[260136.733693]  ksys_read+0x4f/0xb0
[260136.737012]  do_syscall_64+0x5b/0x1a0
[260136.740764]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[260136.745906] RIP: 0033:0x7f1bd4687ab4
[260136.749574] Code: Unable to access opcode bytes at RIP
0x7f1bd4687a8a.
[260136.756181] RSP: 002b:00007fff63f65170 EFLAGS: 00000246 ORIG_RAX:
0000000000000000
[260136.763834] RAX: ffffffffffffffda RBX: 0000000000000005 RCX:
00007f1bd4687ab4
[260136.771056] RDX: 0000000000100000 RSI: 00007f1b69dc3000 RDI:
0000000000000005
[260136.778274] RBP: 00007f1b69dc3000 R08: 0000000000000000 R09:
0000000000000000
[260136.785494] R10: 000000008fd0ea42 R11: 0000000000000246 R12:
0000000000100000
[260136.792714] R13: 000055ca4b405ec0 R14: 0000000000100000 R15:
000055ca4b405ee8
[260259.123003] INFO: task kworker/u128:0:3589938 blocked for more than
120 seconds.
[260259.130487]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260259.138313] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260259.146224] task:kworker/u128:0  state:D stack:    0 pid:3589938
ppid:     2 flags:0x80004080
[260259.154832] Workqueue: writeback wb_workfn (flush-zfs-540)
[260259.160411] Call Trace:
[260259.162950]  __schedule+0x2d1/0x830
[260259.166531]  schedule+0x35/0xa0
[260259.169765]  io_schedule+0x12/0x40
[260259.173257]  __lock_page+0x12d/0x230
[260259.176921]  ? file_fdatawait_range+0x20/0x20
[260259.181368]  write_cache_pages+0x1f2/0x420
[260259.185554]  ? zpl_readpage_filler+0x10/0x10 [zfs]
[260259.190633]  zpl_writepages+0x98/0x130 [zfs]
[260259.195183]  do_writepages+0xc2/0x1c0
[260259.198935]  __writeback_single_inode+0x39/0x2f0
[260259.203640]  writeback_sb_inodes+0x1e6/0x450
[260259.208002]  __writeback_inodes_wb+0x5f/0xc0
[260259.212359]  wb_writeback+0x247/0x2e0
[260259.216114]  ? get_nr_inodes+0x35/0x50
[260259.219953]  wb_workfn+0x37c/0x4d0
[260259.223443]  ? __switch_to_asm+0x35/0x70
[260259.227456]  ? __switch_to_asm+0x41/0x70
[260259.231469]  ? __switch_to_asm+0x35/0x70
[260259.235481]  ? __switch_to_asm+0x41/0x70
[260259.239495]  ? __switch_to_asm+0x35/0x70
[260259.243505]  ? __switch_to_asm+0x41/0x70
[260259.247518]  ? __switch_to_asm+0x35/0x70
[260259.251533]  ? __switch_to_asm+0x41/0x70
[260259.255545]  process_one_work+0x1a7/0x360
[260259.259645]  worker_thread+0x30/0x390
[260259.263396]  ? create_worker+0x1a0/0x1a0
[260259.267409]  kthread+0x10a/0x120
[260259.270730]  ? set_kthread_struct+0x40/0x40
[260259.275003]  ret_from_fork+0x35/0x40
[260259.278712] INFO: task fio:3791107 blocked for more than 120
seconds.
[260259.285240]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260259.293064] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260259.300976] task:fio             state:D stack:    0 pid:3791107
ppid:3790841 flags:0x00004080
[260259.309668] Call Trace:
[260259.312210]  __schedule+0x2d1/0x830
[260259.315787]  schedule+0x35/0xa0
[260259.319020]  io_schedule+0x12/0x40
[260259.322511]  wait_on_page_bit+0x123/0x220
[260259.326611]  ? xas_load+0x8/0x80
[260259.329930]  ? file_fdatawait_range+0x20/0x20
[260259.334376]  filemap_page_mkwrite+0x9b/0xb0
[260259.338650]  do_page_mkwrite+0x53/0x90
[260259.342489]  ? vm_normal_page+0x1a/0xc0
[260259.346415]  do_wp_page+0x298/0x350
[260259.349994]  __handle_mm_fault+0x44f/0x6c0
[260259.354181]  ? __switch_to_asm+0x41/0x70
[260259.358193]  handle_mm_fault+0xc1/0x1e0
[260259.362117]  do_user_addr_fault+0x1b5/0x440
[260259.366391]  do_page_fault+0x37/0x130
[260259.370145]  ? page_fault+0x8/0x30
[260259.373639]  page_fault+0x1e/0x30
[260259.377043] RIP: 0033:0x7f6deee7f1b4
[260259.380714] Code: Unable to access opcode bytes at RIP
0x7f6deee7f18a.
[260259.387323] RSP: 002b:00007fffe41b6538 EFLAGS: 00010202
[260259.392633] RAX: 00007f6d83049000 RBX: 0000556b63614ec0 RCX:
00007f6d83148fe0
[260259.399853] RDX: 00000000000acfe0 RSI: 00007f6d84e9c030 RDI:
00007f6d8309bfa0
[260259.407074] RBP: 00007f6d84f4a000 R08: ffffffffffffffe0 R09:
0000000000000000
[260259.414291] R10: 00007f6d84f8e810 R11: 00007f6d83049000 R12:
0000000000000001
[260259.421512] R13: 0000556b63614ec0 R14: 0000000000100000 R15:
0000556b63614ee8
[260259.428733] INFO: task fio:3791108 blocked for more than 120
seconds.
[260259.435258]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260259.443085] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260259.450997] task:fio             state:D stack:    0 pid:3791108
ppid:3790835 flags:0x00004080
[260259.459689] Call Trace:
[260259.462228]  __schedule+0x2d1/0x830
[260259.465808]  ? cv_wait_common+0x12d/0x240 [spl]
[260259.470435]  schedule+0x35/0xa0
[260259.473669]  io_schedule+0x12/0x40
[260259.477161]  __lock_page+0x12d/0x230
[260259.480828]  ? file_fdatawait_range+0x20/0x20
[260259.485274]  zfs_putpage+0x148/0x590 [zfs]
[260259.489640]  ? rmap_walk_file+0x116/0x290
[260259.493742]  ? __mod_memcg_lruvec_state+0x5d/0x160
[260259.498619]  zpl_putpage+0x67/0xd0 [zfs]
[260259.502813]  write_cache_pages+0x197/0x420
[260259.506998]  ? zpl_readpage_filler+0x10/0x10 [zfs]
[260259.512054]  zpl_writepages+0x119/0x130 [zfs]
[260259.516672]  do_writepages+0xc2/0x1c0
[260259.520423]  ? flush_tlb_func_common.constprop.9+0x125/0x220
[260259.526170]  __filemap_fdatawrite_range+0xc7/0x100
[260259.531050]  filemap_write_and_wait_range+0x30/0x80
[260259.536016]  generic_file_direct_write+0x120/0x160
[260259.540896]  ? rrw_exit+0xb0/0x1c0 [zfs]
[260259.545099]  zpl_iter_write+0xdd/0x160 [zfs]
[260259.549639]  new_sync_write+0x112/0x160
[260259.553566]  vfs_write+0xa5/0x1a0
[260259.556971]  ksys_write+0x4f/0xb0
[260259.560379]  do_syscall_64+0x5b/0x1a0
[260259.564131]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[260259.569269] RIP: 0033:0x7f9d192c7a17
[260259.572935] Code: Unable to access opcode bytes at RIP
0x7f9d192c79ed.
[260259.579549] RSP: 002b:00007ffc8e4ba270 EFLAGS: 00000293 ORIG_RAX:
0000000000000001
[260259.587200] RAX: ffffffffffffffda RBX: 0000000000000005 RCX:
00007f9d192c7a17
[260259.594419] RDX: 0000000000100000 RSI: 00007f9caea03000 RDI:
0000000000000005
[260259.601639] RBP: 00007f9caea03000 R08: 0000000000000000 R09:
0000000000000000
[260259.608859] R10: 00005558e8975680 R11: 0000000000000293 R12:
0000000000100000
[260259.616078] R13: 00005558e8985ec0 R14: 0000000000100000 R15:
00005558e8985ee8
[260259.623298] INFO: task fio:3791109 blocked for more than 120
seconds.
[260259.629827]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260259.637650] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260259.645564] task:fio             state:D stack:    0 pid:3791109
ppid:3790838 flags:0x00004080
[260259.654254] Call Trace:
[260259.656794]  __schedule+0x2d1/0x830
[260259.660373]  ? zfs_znode_held+0xe6/0x140 [zfs]
[260259.665081]  schedule+0x35/0xa0
[260259.668313]  cv_wait_common+0x153/0x240 [spl]
[260259.672768]  ? finish_wait+0x80/0x80
[260259.676441]  zfs_rangelock_enter_reader+0xa1/0x1f0 [zfs]
[260259.682026]  zfs_rangelock_enter_impl+0xbf/0x170 [zfs]
[260259.687432]  zfs_get_data+0x113/0x770 [zfs]
[260259.691876]  zil_lwb_commit+0x537/0x780 [zfs]
[260259.696497]  zil_process_commit_list+0x14c/0x460 [zfs]
[260259.701895]  zil_commit_writer+0xeb/0x160 [zfs]
[260259.706689]  zil_commit_impl+0x5d/0xa0 [zfs]
[260259.711228]  zfs_putpage+0x516/0x590 [zfs]
[260259.715589]  zpl_putpage+0x67/0xd0 [zfs]
[260259.719775]  write_cache_pages+0x197/0x420
[260259.723959]  ? zpl_readpage_filler+0x10/0x10 [zfs]
[260259.729013]  zpl_writepages+0x119/0x130 [zfs]
[260259.733632]  do_writepages+0xc2/0x1c0
[260259.737384]  __filemap_fdatawrite_range+0xc7/0x100
[260259.742264]  filemap_write_and_wait_range+0x30/0x80
[260259.747229]  zpl_iter_read_direct+0x86/0x1b0 [zfs]
[260259.752286]  ? rrw_exit+0xb0/0x1c0 [zfs]
[260259.756487]  zpl_iter_read+0x90/0xb0 [zfs]
[260259.760855]  new_sync_read+0x10f/0x150
[260259.764696]  vfs_read+0x91/0x140
[260259.768013]  ksys_read+0x4f/0xb0
[260259.771332]  do_syscall_64+0x5b/0x1a0
[260259.775087]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[260259.780225] RIP: 0033:0x7f1bd4687ab4
[260259.783893] Code: Unable to access opcode bytes at RIP
0x7f1bd4687a8a.
[260259.790503] RSP: 002b:00007fff63f65170 EFLAGS: 00000246 ORIG_RAX:
0000000000000000
[260259.798157] RAX: ffffffffffffffda RBX: 0000000000000005 RCX:
00007f1bd4687ab4
[260259.805377] RDX: 0000000000100000 RSI: 00007f1b69dc3000 RDI:
0000000000000005
[260259.812592] RBP: 00007f1b69dc3000 R08: 0000000000000000 R09:
0000000000000000
[260259.819814] R10: 000000008fd0ea42 R11: 0000000000000246 R12:
0000000000100000
[260259.827032] R13: 000055ca4b405ec0 R14: 0000000000100000 R15:
000055ca4b405ee8
[260382.001731] INFO: task kworker/u128:0:3589938 blocked for more than
120 seconds.
[260382.009227]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260382.017053] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260382.024963] task:kworker/u128:0  state:D stack:    0 pid:3589938
ppid:     2 flags:0x80004080
[260382.033568] Workqueue: writeback wb_workfn (flush-zfs-540)
[260382.039141] Call Trace:
[260382.041683]  __schedule+0x2d1/0x830
[260382.045271]  schedule+0x35/0xa0
[260382.048503]  io_schedule+0x12/0x40
[260382.051994]  __lock_page+0x12d/0x230
[260382.055662]  ? file_fdatawait_range+0x20/0x20
[260382.060107]  write_cache_pages+0x1f2/0x420
[260382.064293]  ? zpl_readpage_filler+0x10/0x10 [zfs]
[260382.069379]  zpl_writepages+0x98/0x130 [zfs]
[260382.073919]  do_writepages+0xc2/0x1c0
[260382.077672]  __writeback_single_inode+0x39/0x2f0
[260382.082379]  writeback_sb_inodes+0x1e6/0x450
[260382.086738]  __writeback_inodes_wb+0x5f/0xc0
[260382.091097]  wb_writeback+0x247/0x2e0
[260382.094850]  ? get_nr_inodes+0x35/0x50
[260382.098689]  wb_workfn+0x37c/0x4d0
[260382.102181]  ? __switch_to_asm+0x35/0x70
[260382.106194]  ? __switch_to_asm+0x41/0x70
[260382.110207]  ? __switch_to_asm+0x35/0x70
[260382.114221]  ? __switch_to_asm+0x41/0x70
[260382.118231]  ? __switch_to_asm+0x35/0x70
[260382.122244]  ? __switch_to_asm+0x41/0x70
[260382.126256]  ? __switch_to_asm+0x35/0x70
[260382.130273]  ? __switch_to_asm+0x41/0x70
[260382.134284]  process_one_work+0x1a7/0x360
[260382.138384]  worker_thread+0x30/0x390
[260382.142136]  ? create_worker+0x1a0/0x1a0
[260382.146150]  kthread+0x10a/0x120
[260382.149469]  ? set_kthread_struct+0x40/0x40
[260382.153741]  ret_from_fork+0x35/0x40
[260382.157448] INFO: task fio:3791107 blocked for more than 120
seconds.
[260382.163977]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260382.171802] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260382.179715] task:fio             state:D stack:    0 pid:3791107
ppid:3790841 flags:0x00004080
[260382.188409] Call Trace:
[260382.190945]  __schedule+0x2d1/0x830
[260382.194527]  schedule+0x35/0xa0
[260382.197757]  io_schedule+0x12/0x40
[260382.201249]  wait_on_page_bit+0x123/0x220
[260382.205350]  ? xas_load+0x8/0x80
[260382.208668]  ? file_fdatawait_range+0x20/0x20
[260382.213114]  filemap_page_mkwrite+0x9b/0xb0
[260382.217386]  do_page_mkwrite+0x53/0x90
[260382.221227]  ? vm_normal_page+0x1a/0xc0
[260382.225152]  do_wp_page+0x298/0x350
[260382.228733]  __handle_mm_fault+0x44f/0x6c0
[260382.232919]  ? __switch_to_asm+0x41/0x70
[260382.236930]  handle_mm_fault+0xc1/0x1e0
[260382.240856]  do_user_addr_fault+0x1b5/0x440
[260382.245132]  do_page_fault+0x37/0x130
[260382.248883]  ? page_fault+0x8/0x30
[260382.252375]  page_fault+0x1e/0x30
[260382.255781] RIP: 0033:0x7f6deee7f1b4
[260382.259451] Code: Unable to access opcode bytes at RIP
0x7f6deee7f18a.
[260382.266059] RSP: 002b:00007fffe41b6538 EFLAGS: 00010202
[260382.271373] RAX: 00007f6d83049000 RBX: 0000556b63614ec0 RCX:
00007f6d83148fe0
[260382.278591] RDX: 00000000000acfe0 RSI: 00007f6d84e9c030 RDI:
00007f6d8309bfa0
[260382.285813] RBP: 00007f6d84f4a000 R08: ffffffffffffffe0 R09:
0000000000000000
[260382.293030] R10: 00007f6d84f8e810 R11: 00007f6d83049000 R12:
0000000000000001
[260382.300249] R13: 0000556b63614ec0 R14: 0000000000100000 R15:
0000556b63614ee8
[260382.307472] INFO: task fio:3791108 blocked for more than 120
seconds.
[260382.313997]       Tainted: P           OE    --------- -  -
4.18.0-408.el8.x86_64 openzfs#1
[260382.321823] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[260382.329734] task:fio             state:D stack:    0 pid:3791108
ppid:3790835 flags:0x00004080
[260382.338427] Call Trace:
[260382.340967]  __schedule+0x2d1/0x830
[260382.344547]  ? cv_wait_common+0x12d/0x240 [spl]
[260382.349173]  schedule+0x35/0xa0
[260382.352406]  io_schedule+0x12/0x40
[260382.355899]  __lock_page+0x12d/0x230
[260382.359563]  ? file_fdatawait_range+0x20/0x20
[260382.364010]  zfs_putpage+0x148/0x590 [zfs]
[260382.368379]  ? rmap_walk_file+0x116/0x290
[260382.372479]  ? __mod_memcg_lruvec_state+0x5d/0x160
[260382.377358]  zpl_putpage+0x67/0xd0 [zfs]
[260382.381552]  write_cache_pages+0x197/0x420
[260382.385739]  ? zpl_readpage_filler+0x10/0x10 [zfs]
[260382.390791]  zpl_writepages+0x119/0x130 [zfs]
[260382.395410]  do_writepages+0xc2/0x1c0
[260382.399161]  ? flush_tlb_func_common.constprop.9+0x125/0x220
[260382.404907]  __filemap_fdatawrite_range+0xc7/0x100
[260382.409790]  filemap_write_and_wait_range+0x30/0x80
[260382.414752]  generic_file_direct_write+0x120/0x160
[260382.419632]  ? rrw_exit+0xb0/0x1c0 [zfs]
[260382.423838]  zpl_iter_write+0xdd/0x160 [zfs]
[260382.428379]  new_sync_write+0x112/0x160
[260382.432304]  vfs_write+0xa5/0x1a0
[260382.435711]  ksys_write+0x4f/0xb0
[260382.439115]  do_syscall_64+0x5b/0x1a0
[260382.442866]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[260382.448007] RIP: 0033:0x7f9d192c7a17
[260382.451675] Code: Unable to access opcode bytes at RIP
0x7f9d192c79ed.
[260382.458286] RSP: 002b:00007ffc8e4ba270 EFLAGS: 00000293 ORIG_RAX:
0000000000000001
[260382.465938] RAX: ffffffffffffffda RBX: 0000000000000005 RCX:
00007f9d192c7a17
[260382.473158] RDX: 0000000000100000 RSI: 00007f9caea03000 RDI:
0000000000000005
[260382.480379] RBP: 00007f9caea03000 R08: 0000000000000000 R09:
0000000000000000
[260382.487597] R10: 00005558e8975680 R11: 0000000000000293 R12:
0000000000100000
[260382.494814] R13: 00005558e8985ec0 R14: 0000000000100000 R15:
00005558e8985ee8

Signed-off-by: Brian Atkinson <[email protected]>
  • Loading branch information
bwatkinson committed Sep 12, 2024
1 parent 0877195 commit 1d585f8
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 245 deletions.
120 changes: 0 additions & 120 deletions config/kernel-generic_file_direct_write.m4

This file was deleted.

2 changes: 0 additions & 2 deletions config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_GENHD_FLAGS
ZFS_AC_KERNEL_SRC_REVALIDATE_DISK
ZFS_AC_KERNEL_SRC_GET_DISK_RO
ZFS_AC_KERNEL_SRC_GENERIC_FILE_DIRECT_WRITE
ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL
ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY
ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE
Expand Down Expand Up @@ -225,7 +224,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_GENHD_FLAGS
ZFS_AC_KERNEL_REVALIDATE_DISK
ZFS_AC_KERNEL_GET_DISK_RO
ZFS_AC_KERNEL_GENERIC_FILE_DIRECT_WRITE
ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL
ZFS_AC_KERNEL_DISCARD_GRANULARITY
ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE
Expand Down
27 changes: 0 additions & 27 deletions include/os/linux/zfs/sys/zpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,31 +293,4 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg);
#define zpl_inode_set_mtime_to_ts(ip, ts) (ip->i_mtime = ts)
#endif

/*
* HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER* align with HAVE_VFS_RW_ITERATE
*/
#if defined(HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER)
/* 4.7 API */
#define zpl_generic_file_direct_write(iocb, iter, off) \
generic_file_direct_write(iocb, iter)

#elif defined(HAVE_GENERIC_FILE_DIRECT_WRITE_IOV_ITER_WITH_LOFF)
/* 3.16 API */
#define zpl_generic_file_direct_write(iocb, iter, off) \
generic_file_direct_write(iocb, iter, off)

#elif defined(HAVE_GENERIC_FILE_DIRECT_WRITE_IOVEC)
/* 3.15 API */
#define zpl_generic_file_direct_write(iocb, vec, segs, pos, ppos, cnt, ocnt) \
generic_file_direct_write(iocb, vec, segs, pos, cnt, ocnt)

#elif defined(HAVE_GENERIC_FILE_DIRECT_WRITE_IOVEC_LOFF_PTR)
/* 3.10 API */
#define zpl_generic_file_direct_write(iocb, vec, segs, pos, ppos, cnt, ocnt) \
generic_file_direct_write(iocb, vec, segs, pos, ppos, cnt, ocnt)

#else
#error "Unsupported kernel"
#endif

#endif /* _SYS_ZPL_H */
95 changes: 22 additions & 73 deletions module/os/linux/zfs/zpl_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -351,16 +351,6 @@ zpl_iter_read_direct(struct kiocb *kiocb, struct iov_iter *to)
zfs_uio_t uio;
ssize_t ret;

/*
* Attempt to flush out any pages from the page cache. On error
* fallback to the buffered path.
*/
ret = filemap_write_and_wait_range(filp->f_mapping, kiocb->ki_pos,
kiocb->ki_pos + count - 1);

if (ret < 0)
return (ret);

zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0);

/* On error, return to fallback to the buffered path. */
Expand Down Expand Up @@ -508,6 +498,7 @@ zpl_iter_write_direct(struct kiocb *kiocb, struct iov_iter *from)
return (error);

wrote = count - uio.uio_resid;
kiocb->ki_pos += wrote;

return (wrote);
}
Expand All @@ -533,22 +524,9 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
if (direct == ZFS_DIRECT_IO_ERR) {
return (-error);
} else if (direct == ZFS_DIRECT_IO_ENABLED) {
/*
* zpl_generic_file_direct_write() will attempt to flush out any
* pages in the page cache and invalidate them. If this is
* successful it will cal the direct_IO
* address_space_operation (zpl_iter_write_direct()).
*/
ssize_t wrote = zpl_generic_file_direct_write(kiocb, from,
kiocb->ki_pos);
ssize_t wrote = zpl_iter_write_direct(kiocb, from);

if (wrote >= 0 || wrote != -EAGAIN) {
/*
* generic_file_direct_write() will update
* kiocb->ki_pos on a successful Direct IO write.
*/
IMPLY(wrote >= 0,
(offset + wrote) == kiocb->ki_pos);
return (wrote);
}

Expand Down Expand Up @@ -619,16 +597,6 @@ zpl_aio_read_direct(struct kiocb *kiocb, const struct iovec *iov,
if (ret)
return (ret);

/*
* Attempt to flush out any pages from the page cache. On error
* fallback to the buffered path.
*/
ret = filemap_write_and_wait_range(filp->f_mapping, kiocb->ki_pos,
kiocb->ki_pos + iov_length(iov, nr_segs) - 1);

if (ret < 0)
return (ret);

zfs_uio_t uio;
zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
count, 0);
Expand Down Expand Up @@ -771,6 +739,7 @@ zpl_aio_write_direct(struct kiocb *kiocb, const struct iovec *iov,
return (error);

ssize_t wrote = count - uio.uio_resid;
kiocb->ki_pos += wrote;

return (wrote);
}
Expand Down Expand Up @@ -805,21 +774,9 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
if (direct == ZFS_DIRECT_IO_ERR) {
return (-error);
} else if (direct == ZFS_DIRECT_IO_ENABLED) {
/*
* zpl_generic_file_direct_write() will attempt to flush out any
* pages in the page cahce and invalidate them. If this is
* successful it will call the direct_IO
* address_space_operation (zpl_aio_write_direct()).
*/
ssize_t wrote = zpl_generic_file_direct_write(kiocb, iov,
&nr_segs, pos, &kiocb->ki_pos, count, ocount);
ssize_t wrote = zpl_aio_write_direct(kiocb, iov, nr_segs, pos);

if (wrote >= 0 || wrote != -EAGAIN) {
/*
* generic_file_direct_write() will update
* kiocb->ki_pos on a successful Direct IO write.
*/
IMPLY(wrote >= 0, (pos + wrote) == kiocb->ki_pos);
return (wrote);
}

Expand All @@ -835,35 +792,37 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,

#endif /* HAVE_VFS_RW_ITERATE */

static ssize_t
zpl_direct_IO_impl(void)
{
/*
* All O_DIRECT requests should be handled by
* zpl_{iter/aio}_{write/read}(). There is no way kernel generic code
* should call the direct_IO address_space_operations function. We set
* this code path to be fatal if it is executed.
*/
VERIFY(0);
return (0);
}

#if defined(HAVE_VFS_RW_ITERATE)
#if defined(HAVE_VFS_DIRECT_IO_ITER)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
if (iov_iter_rw(iter) == WRITE)
return (zpl_iter_write_direct(kiocb, iter));
else
return (zpl_iter_read(kiocb, iter));
return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
ASSERT3S(pos, ==, kiocb->ki_pos);
if (iov_iter_rw(iter) == WRITE)
return (zpl_iter_write_direct(kiocb, iter));
else
return (zpl_iter_read(kiocb, iter));
return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
ASSERT3S(pos, ==, kiocb->ki_pos);
if (rw == WRITE)
return (zpl_iter_write_direct(kiocb, iter));
else
return (zpl_iter_read(kiocb, iter));
return (zpl_direct_IO_impl());
}
#else
#error "Unknown direct IO interface"
Expand All @@ -876,23 +835,13 @@ static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
loff_t pos, unsigned long nr_segs)
{
if (rw == WRITE)
return (zpl_aio_write_direct(kiocb, iov, nr_segs, pos));
else
return (zpl_aio_read(kiocb, iov, nr_segs, pos));
return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
const struct iovec *iovp = iov_iter_iovec(iter);
unsigned long nr_segs = iter->nr_segs;

ASSERT3S(pos, ==, kiocb->ki_pos);
if (rw == WRITE)
return (zpl_aio_write_direct(kiocb, iovp, nr_segs, pos));
else
return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
return (zpl_direct_IO_impl());
}
#else
#error "Unknown direct IO interface"
Expand Down
Loading

0 comments on commit 1d585f8

Please sign in to comment.