Skip to content

Commit

Permalink
mm: remember young/dirty bit for page migrations
Browse files Browse the repository at this point in the history
When page migration happens, we always ignore the young/dirty bit settings
in the old pgtable: we mark the page as old in the new page table using
either pte_mkold() or pmd_mkold(), and we keep the pte clean.

That's fine functionally, but it's not friendly to page reclaim, because
the page being moved can be actively accessed during the procedure.
Not to mention that having hardware set the young bit can bring quite some
overhead on some systems, e.g. x86_64 needs a few hundred nanoseconds to
set the bit.  The same slowdown applies to the dirty bit when the memory is
first written after page migration has happened.

Actually we can easily remember the A/D bit configuration and recover the
information after the page is migrated.  To achieve it, define a new set
of bits in the migration swap offset field to cache the A/D bits for old
pte.  Then when removing/recovering the migration entry, we can recover
the A/D bits even if the page changed.

One thing to mention is that here we used max_swapfile_size() to detect
how many swp offset bits we have, and we'll only enable this feature if we
know the swp offset is big enough to store both the PFN value and the A/D
bits.  Otherwise the A/D bits are dropped like before.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Peter Xu <[email protected]>
Reviewed-by: "Huang, Ying" <[email protected]>
Cc: Alistair Popple <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: "Kirill A . Shutemov" <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Nadav Amit <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Dave Hansen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
  • Loading branch information
xzpeter authored and akpm00 committed Sep 27, 2022
1 parent 0ccf7f1 commit 2e34687
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 4 deletions.
99 changes: 99 additions & 0 deletions include/linux/swapops.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@

#ifdef CONFIG_MMU

#ifdef CONFIG_SWAP
#include <linux/swapfile.h>
#endif /* CONFIG_SWAP */

/*
* swapcache pages are stored in the swapper_space radix tree. We want to
* get good packing density in that tree, so the index should be dense in
Expand Down Expand Up @@ -35,6 +39,31 @@
#endif /* MAX_PHYSMEM_BITS */
#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1)

/**
 * Migration swap entry specific bitfield definitions.  Layout:
 *
 *   |----------+--------------------|
 *   | swp_type | swp_offset         |
 *   |----------+--------+-+-+-------|
 *   |          | resv   |D|A|  PFN  |
 *   |----------+--------+-+-+-------|
 *
 * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A)
 * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D)
 *
 * The A bit sits directly above the PFN bits and the D bit directly above
 * the A bit, so SWP_MIG_TOTAL_BITS (PFN bits plus two) is the number of
 * swp offset bits needed to hold PFN, A and D together.
 *
 * Note: A/D bits will be stored in migration entries iff there are enough
 * free bits in the arch specific swp offset.  By default we'll ignore A/D
 * bits when migrating a page.  Please refer to
 * migration_entry_supports_ad() for more information.  If there are more
 * bits besides the PFN and A/D bits, they should be reserved and always be
 * zeros.
 */
#define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS)
#define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1)
#define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2)

/* Single-bit masks to be applied to the swp offset value. */
#define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT)
#define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT)

static inline bool is_pfn_swap_entry(swp_entry_t entry);

/* Clear all flags but only keep swp_entry_t related information */
Expand Down Expand Up @@ -265,6 +294,57 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
return swp_entry(SWP_MIGRATION_WRITE, offset);
}

/*
 * Tell whether migration swap entries on this host can carry the pgtable
 * A/D bits across a page migration.  This requires a swap offset field
 * wide enough for the PFN plus the two extra bits, so the answer is
 * pretty much arch specific.  Without CONFIG_SWAP it is always false.
 */
static inline bool migration_entry_supports_ad(void)
{
#ifndef CONFIG_SWAP
	return false;
#else /* CONFIG_SWAP */
	/*
	 * max_swapfile_size() yields the max supported swp-offset plus 1,
	 * so A/D bits fit iff that value reaches the first offset that
	 * would need more than PFN + A + D bits.
	 */
	const unsigned long ad_offset_span = 1UL << SWP_MIG_TOTAL_BITS;

	return max_swapfile_size() >= ad_offset_span;
#endif /* CONFIG_SWAP */
}

/*
 * Return a copy of @entry with the young (A) bit recorded in its offset.
 * When A/D bits are not supported, @entry is handed back untouched.
 */
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
	if (!migration_entry_supports_ad())
		return entry;

	return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG);
}

/*
 * Report whether @entry has the young (A) bit recorded.  Without A/D bit
 * support this is always false, which keeps the old behavior of aging
 * the page after migration.
 */
static inline bool is_migration_entry_young(swp_entry_t entry)
{
	return migration_entry_supports_ad() &&
	       (swp_offset(entry) & SWP_MIG_YOUNG);
}

/*
 * Return a copy of @entry with the dirty (D) bit recorded in its offset.
 * When A/D bits are not supported, @entry is handed back untouched.
 */
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
	if (!migration_entry_supports_ad())
		return entry;

	return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY);
}

/*
 * Report whether @entry has the dirty (D) bit recorded.  Without A/D bit
 * support this is always false, which keeps the old behavior of a clean
 * page after migration.
 */
static inline bool is_migration_entry_dirty(swp_entry_t entry)
{
	return migration_entry_supports_ad() &&
	       (swp_offset(entry) & SWP_MIG_DIRTY);
}

extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
Expand Down Expand Up @@ -311,6 +391,25 @@ static inline int is_readable_migration_entry(swp_entry_t entry)
return 0;
}

/*
 * Stub variant: hands @entry back unchanged, so no young (A) bit is
 * recorded.
 * NOTE(review): judging by the "#endif CONFIG_MIGRATION" that closes this
 * group, this looks like the !CONFIG_MIGRATION fallback; confirm against
 * the full header.
 */
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
return entry;
}

/*
 * Stub variant: always reports that @entry carries no young (A) bit.
 * NOTE(review): judging by the "#endif CONFIG_MIGRATION" that closes this
 * group, this looks like the !CONFIG_MIGRATION fallback; confirm against
 * the full header.
 */
static inline bool is_migration_entry_young(swp_entry_t entry)
{
return false;
}

/*
 * Stub variant: hands @entry back unchanged, so no dirty (D) bit is
 * recorded.
 * NOTE(review): judging by the "#endif CONFIG_MIGRATION" that closes this
 * group, this looks like the !CONFIG_MIGRATION fallback; confirm against
 * the full header.
 */
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
return entry;
}

/*
 * Stub variant: always reports that @entry carries no dirty (D) bit.
 * NOTE(review): judging by the "#endif CONFIG_MIGRATION" that closes this
 * group, this looks like the !CONFIG_MIGRATION fallback; confirm against
 * the full header.
 */
static inline bool is_migration_entry_dirty(swp_entry_t entry)
{
return false;
}
#endif /* CONFIG_MIGRATION */

typedef unsigned long pte_marker;
Expand Down
18 changes: 16 additions & 2 deletions mm/huge_memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -2121,7 +2121,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = is_writable_migration_entry(entry);
if (PageAnon(page))
anon_exclusive = is_readable_exclusive_migration_entry(entry);
young = false;
young = is_migration_entry_young(entry);
dirty = is_migration_entry_dirty(entry);
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
Expand Down Expand Up @@ -2183,6 +2184,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i));
if (young)
swp_entry = make_migration_entry_young(swp_entry);
if (dirty)
swp_entry = make_migration_entry_dirty(swp_entry);
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
Expand Down Expand Up @@ -3201,6 +3206,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
else
entry = make_readable_migration_entry(page_to_pfn(page));
if (pmd_young(pmdval))
entry = make_migration_entry_young(entry);
if (pmd_dirty(pmdval))
entry = make_migration_entry_dirty(entry);
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
Expand All @@ -3226,13 +3235,18 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)

entry = pmd_to_swp_entry(*pvmw->pmd);
get_page(new);
pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)));
pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
if (!is_migration_entry_young(entry))
pmde = pmd_mkold(pmde);
/* NOTE: this may contain setting soft-dirty on some archs */
if (PageDirty(new) && is_migration_entry_dirty(entry))
pmde = pmd_mkdirty(pmde);

if (PageAnon(new)) {
rmap_t rmap_flags = RMAP_COMPOUND;
Expand Down
6 changes: 5 additions & 1 deletion mm/migrate.c
Original file line number Diff line number Diff line change
Expand Up @@ -198,14 +198,18 @@ static bool remove_migration_pte(struct folio *folio,
#endif

folio_get(folio);
pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
if (pte_swp_soft_dirty(*pvmw.pte))
pte = pte_mksoft_dirty(pte);

/*
* Recheck VMA as permissions can change since migration started
*/
entry = pte_to_swp_entry(*pvmw.pte);
if (!is_migration_entry_young(entry))
pte = pte_mkold(pte);
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
Expand Down
6 changes: 6 additions & 0 deletions mm/migrate_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,12 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
else
entry = make_readable_migration_entry(
page_to_pfn(page));
if (pte_present(pte)) {
if (pte_young(pte))
entry = make_migration_entry_young(entry);
if (pte_dirty(pte))
entry = make_migration_entry_dirty(entry);
}
swp_pte = swp_entry_to_pte(entry);
if (pte_present(pte)) {
if (pte_soft_dirty(pte))
Expand Down
5 changes: 4 additions & 1 deletion mm/rmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2066,7 +2066,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));

if (pte_young(pteval))
entry = make_migration_entry_young(entry);
if (pte_dirty(pteval))
entry = make_migration_entry_dirty(entry);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
Expand Down

0 comments on commit 2e34687

Please sign in to comment.