244 lines
8.2 KiB
Diff
244 lines
8.2 KiB
Diff
From 397624e12244ec038f51cb1f178ccb7a2ec562e5 Mon Sep 17 00:00:00 2001
|
|
From: "T.J. Alumbaugh" <talumbau@google.com>
|
|
Date: Wed, 18 Jan 2023 00:18:23 +0000
|
|
Subject: [PATCH 14/19] UPSTREAM: mm: multi-gen LRU: section for Bloom filters
|
|
|
|
Move Bloom filters code into a dedicated section. Improve the design doc
|
|
to explain Bloom filter usage and connection between aging and eviction in
|
|
their use.
|
|
|
|
Link: https://lkml.kernel.org/r/20230118001827.1040870-4-talumbau@google.com
|
|
Change-Id: I73e866f687c1ed9f5c8538086aa39408b79897db
|
|
Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
|
|
Cc: Yu Zhao <yuzhao@google.com>
|
|
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|
(cherry picked from commit ccbbbb85945d8f0255aa9dbc1b617017e2294f2c)
|
|
Bug: 274865848
|
|
Signed-off-by: T.J. Mercier <tjmercier@google.com>
|
|
---
|
|
Documentation/mm/multigen_lru.rst | 16 +++
|
|
mm/vmscan.c | 180 +++++++++++++++---------------
|
|
2 files changed, 108 insertions(+), 88 deletions(-)
|
|
|
|
--- a/Documentation/mm/multigen_lru.rst
|
|
+++ b/Documentation/mm/multigen_lru.rst
|
|
@@ -170,6 +170,22 @@ promotes hot pages. If the scan was done
|
|
adds the PMD entry pointing to the PTE table to the Bloom filter. This
|
|
forms a feedback loop between the eviction and the aging.
|
|
|
|
+Bloom Filters
|
|
+-------------
|
|
+Bloom filters are a space-efficient data structure for set
|
|
+membership tests, i.e., testing whether an element is not in the set
|
|
+or may be in the set.
|
|
+
|
|
+In the eviction path, specifically, in ``lru_gen_look_around()``, if a
|
|
+PMD has a sufficient number of hot pages, its address is placed in the
|
|
+filter. In the aging path, set membership means that the PTE range
|
|
+will be scanned for young pages.
|
|
+
|
|
+Note that Bloom filters are probabilistic on set membership. If a test
|
|
+is a false positive, the cost is an additional scan of a range of PTEs,
|
|
+which may yield hot pages anyway. Parameters of the filter itself can
|
|
+control the false positive rate in the limit.
|
|
+
|
|
Summary
|
|
-------
|
|
The multi-gen LRU can be disassembled into the following parts:
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -3209,6 +3209,98 @@ static bool __maybe_unused seq_is_valid(
|
|
}
|
|
|
|
/******************************************************************************
|
|
+ * Bloom filters
|
|
+ ******************************************************************************/
|
|
+
|
|
+/*
|
|
+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
|
|
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
|
|
+ * bits in a bitmap, k is the number of hash functions and n is the number of
|
|
+ * inserted items.
|
|
+ *
|
|
+ * Page table walkers use one of the two filters to reduce their search space.
|
|
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
|
|
+ * aging uses the double-buffering technique to flip to the other filter each
|
|
+ * time it produces a new generation. For non-leaf entries that have enough
|
|
+ * leaf entries, the aging carries them over to the next generation in
|
|
+ * walk_pmd_range(); the eviction also reports them when walking the rmap
|
|
+ * in lru_gen_look_around().
|
|
+ *
|
|
+ * For future optimizations:
|
|
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
|
|
+ * freed after the RCU grace period and reallocated if needed again.
|
|
+ * 2. And when reallocating, it's worth scaling its size according to the number
|
|
+ * of inserted entries in the other filter, to reduce the memory overhead on
|
|
+ * small systems and false positives on large systems.
|
|
+ * 3. Jenkins' hash function is an alternative to Knuth's.
|
|
+ */
|
|
+#define BLOOM_FILTER_SHIFT 15
|
|
+
|
|
+static inline int filter_gen_from_seq(unsigned long seq)
|
|
+{
|
|
+ return seq % NR_BLOOM_FILTERS;
|
|
+}
|
|
+
|
|
+static void get_item_key(void *item, int *key)
|
|
+{
|
|
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
|
|
+
|
|
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
|
|
+
|
|
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
|
|
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
|
|
+}
|
|
+
|
|
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
|
|
+{
|
|
+ int key[2];
|
|
+ unsigned long *filter;
|
|
+ int gen = filter_gen_from_seq(seq);
|
|
+
|
|
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
|
|
+ if (!filter)
|
|
+ return true;
|
|
+
|
|
+ get_item_key(item, key);
|
|
+
|
|
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
|
|
+}
|
|
+
|
|
+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
|
|
+{
|
|
+ int key[2];
|
|
+ unsigned long *filter;
|
|
+ int gen = filter_gen_from_seq(seq);
|
|
+
|
|
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
|
|
+ if (!filter)
|
|
+ return;
|
|
+
|
|
+ get_item_key(item, key);
|
|
+
|
|
+ if (!test_bit(key[0], filter))
|
|
+ set_bit(key[0], filter);
|
|
+ if (!test_bit(key[1], filter))
|
|
+ set_bit(key[1], filter);
|
|
+}
|
|
+
|
|
+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
|
|
+{
|
|
+ unsigned long *filter;
|
|
+ int gen = filter_gen_from_seq(seq);
|
|
+
|
|
+ filter = lruvec->mm_state.filters[gen];
|
|
+ if (filter) {
|
|
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
|
|
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
|
|
+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
|
|
+}
|
|
+
|
|
+/******************************************************************************
|
|
* mm_struct list
|
|
******************************************************************************/
|
|
|
|
@@ -3333,94 +3425,6 @@ void lru_gen_migrate_mm(struct mm_struct
|
|
}
|
|
#endif
|
|
|
|
-/*
|
|
- * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
|
|
- * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
|
|
- * bits in a bitmap, k is the number of hash functions and n is the number of
|
|
- * inserted items.
|
|
- *
|
|
- * Page table walkers use one of the two filters to reduce their search space.
|
|
- * To get rid of non-leaf entries that no longer have enough leaf entries, the
|
|
- * aging uses the double-buffering technique to flip to the other filter each
|
|
- * time it produces a new generation. For non-leaf entries that have enough
|
|
- * leaf entries, the aging carries them over to the next generation in
|
|
- * walk_pmd_range(); the eviction also report them when walking the rmap
|
|
- * in lru_gen_look_around().
|
|
- *
|
|
- * For future optimizations:
|
|
- * 1. It's not necessary to keep both filters all the time. The spare one can be
|
|
- * freed after the RCU grace period and reallocated if needed again.
|
|
- * 2. And when reallocating, it's worth scaling its size according to the number
|
|
- * of inserted entries in the other filter, to reduce the memory overhead on
|
|
- * small systems and false positives on large systems.
|
|
- * 3. Jenkins' hash function is an alternative to Knuth's.
|
|
- */
|
|
-#define BLOOM_FILTER_SHIFT 15
|
|
-
|
|
-static inline int filter_gen_from_seq(unsigned long seq)
|
|
-{
|
|
- return seq % NR_BLOOM_FILTERS;
|
|
-}
|
|
-
|
|
-static void get_item_key(void *item, int *key)
|
|
-{
|
|
- u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
|
|
-
|
|
- BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
|
|
-
|
|
- key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
|
|
- key[1] = hash >> BLOOM_FILTER_SHIFT;
|
|
-}
|
|
-
|
|
-static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
|
|
-{
|
|
- unsigned long *filter;
|
|
- int gen = filter_gen_from_seq(seq);
|
|
-
|
|
- filter = lruvec->mm_state.filters[gen];
|
|
- if (filter) {
|
|
- bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
|
|
- return;
|
|
- }
|
|
-
|
|
- filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
|
|
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
|
|
- WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
|
|
-}
|
|
-
|
|
-static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
|
|
-{
|
|
- int key[2];
|
|
- unsigned long *filter;
|
|
- int gen = filter_gen_from_seq(seq);
|
|
-
|
|
- filter = READ_ONCE(lruvec->mm_state.filters[gen]);
|
|
- if (!filter)
|
|
- return;
|
|
-
|
|
- get_item_key(item, key);
|
|
-
|
|
- if (!test_bit(key[0], filter))
|
|
- set_bit(key[0], filter);
|
|
- if (!test_bit(key[1], filter))
|
|
- set_bit(key[1], filter);
|
|
-}
|
|
-
|
|
-static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
|
|
-{
|
|
- int key[2];
|
|
- unsigned long *filter;
|
|
- int gen = filter_gen_from_seq(seq);
|
|
-
|
|
- filter = READ_ONCE(lruvec->mm_state.filters[gen]);
|
|
- if (!filter)
|
|
- return true;
|
|
-
|
|
- get_item_key(item, key);
|
|
-
|
|
- return test_bit(key[0], filter) && test_bit(key[1], filter);
|
|
-}
|
|
-
|
|
static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
|
|
{
|
|
int i;
|