/*
 * zbud.c - Compression buddies allocator
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * Compression buddies ("zbud") provides for efficiently packing two
 * (or, possibly in the future, more) compressed pages ("zpages") into
 * a single "raw" pageframe and for tracking both zpages and pageframes
 * so that whole pageframes can be easily reclaimed in LRU-like order.
 * It is designed to be used in conjunction with transcendent memory
 * ("tmem"); for example, separate LRU lists are maintained for persistent
 * vs. ephemeral pages.
 *
 * A zbudpage is an overlay for a struct page and thus each zbudpage
 * refers to a physical pageframe of RAM.  When the caller passes a
 * struct page from the kernel's page allocator, zbud "transforms" it
 * to a zbudpage which sets/uses a different set of fields than the
 * struct page and thus must "untransform" it back by reinitializing
 * certain fields before the struct page can be freed.  The fields
 * of a zbudpage include a page lock for controlling access to the
 * corresponding pageframe, and there is a size field for each zpage.
 * Each zbudpage also lives on two linked lists: a "budlist" which is
 * used to support efficient buddying of zpages; and an "lru" which
 * is used for reclaiming pageframes in approximately least-recently-used
 * order.
 *
 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
 * which contain the compressed data for zero, one, or two zbuds.  Contained
 * within the compressed data is a tmem_handle which is a key to allow
 * the same data to be found via the tmem interface so the zpage can
 * be invalidated (for ephemeral pages) or repatriated to the swap cache
 * (for persistent pages).  The contents of a zbudpageframe must never
 * be accessed without holding the page lock for the corresponding
 * zbudpage and, to accommodate highmem machines, the contents may
 * only be examined or changed when kmapped.  Thus, when in use, a
 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
 *
 * Note that the term "zbud" refers to the combination of a zpage and
 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
 * it also generically refers to this allocator... sorry for any confusion.
 *
 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
 * struct page), with the LSB either cleared or set to indicate, respectively,
 * the first or second zpage in the zbudpageframe.  Since a zbudref can be
 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
 * references a stored tmem page and so is the only zbud data structure
 * externally visible to zbud.c/zbud.h.
 *
 * Since we wish to reclaim entire pageframes but zpages may be randomly
 * added to and deleted from any given pageframe, we approximate LRU by
 * promoting a pageframe to MRU when a zpage is added to it, but
 * leaving it at the current place in the list when a zpage is deleted
 * from it.  As a side effect, zpages that are difficult to buddy (e.g.
 * very large pages) will be reclaimed faster than average, which seems
 * reasonable.
 *
 * In the current implementation, no more than two zpages may be stored in
 * any pageframe and no zpage ever crosses a pageframe boundary.  While
 * other zpage allocation mechanisms may allow greater density, this two
 * zpage-per-pageframe limit both ensures simple reclaim of pageframes
 * (including garbage collection of references to the contents of those
 * pageframes from tmem data structures) AND avoids the need for compaction.
 * With additional complexity, zbud could be modified to support storing
 * up to three zpages per pageframe or, to handle larger average zpages,
 * up to three zpages per pair of pageframes, but it is not clear if the
 * additional complexity would be worth it.  So consider it an exercise
 * for future developers.
 *
 * Note also that zbud does no page allocation or freeing.  This is so
 * that the caller has complete control over, and (for accounting)
 * visibility into, if/when pages are allocated and freed.
 *
 * Finally, note that zbud limits the size of zpages it can store; the
 * caller must check the zpage size with zbud_max_buddy_size before
 * storing it, else BUGs will result.  User beware.
 */
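
/*
 * Illustrative layout example (assuming PAGE_SIZE == 4096, hence
 * CHUNK_SIZE == 64, NCHUNKS == 64, MAX_CHUNK == 63 and
 * zbud_max_size() == 4032; see the CHUNK_* macros and zbud_data() below):
 *
 *   zbud 0 is placed at the start of the pageframe; zbud 1 is placed so
 *   that it ends at the last byte of the pageframe.  A zpage whose
 *   compressed size plus sizeof(struct tmem_handle) is 1000 bytes rounds
 *   up to 16 chunks, leaving 48 chunks available for a potential buddy.
 *
 *      | zbud 0 -->      ...unused chunks...      <-- zbud 1 |
 *      0                                             PAGE_SIZE-1
 */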
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include "tmem.h"
#include "zcache.h"
#include "zbud.h"

/*
 * We need to ensure that a struct zbudpage is never larger than a
 * struct page.  This is checked with a BUG_ON in zbud_init.
 *
 * The unevictable field indicates that a zbud is being added to the
 * zbudpage.  Since this is a two-phase process (due to tmem locking),
 * this field locks the zbudpage against eviction when a zbud match
 * or creation is in process.  Since this addition process may occur
 * in parallel for two zbuds in one zbudpage, the field is a counter
 * that must not exceed two.
 */
struct zbudpage {
	union {
		struct page page;
		struct {
			unsigned long space_for_flags;
			struct {
				unsigned zbud0_size:PAGE_SHIFT;
				unsigned zbud1_size:PAGE_SHIFT;
				unsigned unevictable:2;
			};
			struct list_head budlist;
			struct list_head lru;
		};
	};
};
#if (PAGE_SHIFT * 2) + 2 > BITS_PER_LONG
#error "zbud won't work for this arch, PAGE_SIZE is too large"
#endif

struct zbudref {
	union {
		struct zbudpage *zbudpage;
		unsigned long zbudref;
	};
};

#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define CHUNK_MASK	(~(CHUNK_SIZE-1))
#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
#define MAX_CHUNK	(NCHUNKS-1)

/*
 * The following functions deal with the difference between struct
 * page and struct zbudpage.  Note the hack of using the pageflags
 * from struct page; this is to avoid duplicating all the complex
 * pageflag macros.
 */
static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
		do {
			cpu_relax();
		} while (test_bit(PG_locked, &page->flags));
	}
}

static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_locked, &page->flags);
}

static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
{
	return trylock_page((struct page *)zbudpage);
}

static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
{
	return PageLocked((struct page *)zbudpage);
}

static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
{
	return kmap_atomic((struct page *)zbudpage);
}
/*
 * A dying zbudpage is an ephemeral page in the process of being evicted.
 * Any data contained in the zbudpage is invalid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return test_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	set_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_reclaim, &page->flags);
}

/*
 * A zombie zbudpage is a persistent page in the process of being evicted.
 * The data contained in the zbudpage is valid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return test_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	set_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_dirty, &page->flags);
}

static inline void kunmap_zbudpage_atomic(void *zbpg)
{
	kunmap_atomic(zbpg);
}

/*
 * zbud "translation" and helper functions
 */
static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
{
	unsigned long zbud = (unsigned long)zref;

	zbud &= ~1UL;
	return (struct zbudpage *)zbud;
}

static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
							unsigned budnum)
{
	unsigned long zbud = (unsigned long)zbudpage;

	BUG_ON(budnum > 1);
	zbud |= budnum;
	return (struct zbudref *)zbud;
}

static inline int zbudref_budnum(struct zbudref *zbudref)
{
	unsigned long zbud = (unsigned long)zbudref;

	return zbud & 1UL;
}

static inline unsigned zbud_max_size(void)
{
	return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
	BUG_ON(size == 0 || size > zbud_max_size());
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
static inline char *zbud_data(void *zbpg, unsigned budnum, unsigned size)
{
	char *p;

	BUG_ON(size == 0 || size > zbud_max_size());
	p = (char *)zbpg;
	if (budnum == 1)
		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
	return p;
}
/*
 * These are all informative and exposed through debugfs... except for
 * the arrays... anyone know how to do that?  To avoid confusion for
 * debugfs viewers, some of these should also be atomic_long_t, but
 * I don't know how to expose atomics via debugfs either...
 */
static ssize_t zbud_eph_pageframes;
static ssize_t zbud_pers_pageframes;
static ssize_t zbud_eph_zpages;
static ssize_t zbud_pers_zpages;
static u64 zbud_eph_zbytes;
static u64 zbud_pers_zbytes;
static ssize_t zbud_eph_evicted_pageframes;
static ssize_t zbud_pers_evicted_pageframes;
static ssize_t zbud_eph_cumul_zpages;
static ssize_t zbud_pers_cumul_zpages;
static u64 zbud_eph_cumul_zbytes;
static u64 zbud_pers_cumul_zbytes;
static ssize_t zbud_eph_cumul_chunk_counts[NCHUNKS];
static ssize_t zbud_pers_cumul_chunk_counts[NCHUNKS];
static ssize_t zbud_eph_buddied_count;
static ssize_t zbud_pers_buddied_count;
static ssize_t zbud_eph_unbuddied_count;
static ssize_t zbud_pers_unbuddied_count;
static ssize_t zbud_eph_zombie_count;
static ssize_t zbud_pers_zombie_count;
static atomic_t zbud_eph_zombie_atomic;
static atomic_t zbud_pers_zombie_atomic;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define	zdfs	debugfs_create_size_t
#define	zdfs64	debugfs_create_u64
static int zbud_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("zbud", NULL);

	if (root == NULL)
		return -ENXIO;

	/*
	 * would be nice to dump the sizes of the unbuddied
	 * arrays, like was done with sysfs, but it doesn't
	 * look like debugfs is flexible enough to do that
	 */
	zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
	zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
	zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
	zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
	zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
	zdfs("eph_evicted_pageframes", S_IRUGO, root,
				&zbud_eph_evicted_pageframes);
	zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
	zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
	zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
	zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
	zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
	zdfs("pers_evicted_pageframes", S_IRUGO, root,
				&zbud_pers_evicted_pageframes);
	zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
	zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
	zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
	zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
	zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
	return 0;
}
#undef	zdfs
#undef	zdfs64
#else
static inline int zbud_debugfs_init(void)
{
	return 0;
}
#endif

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);

struct zbud_unbuddied {
	struct list_head list;
	unsigned count;
};

/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
static LIST_HEAD(zbud_eph_lru_list);
static LIST_HEAD(zbud_pers_lru_list);
static LIST_HEAD(zbud_eph_buddied_list);
static LIST_HEAD(zbud_pers_buddied_list);
static LIST_HEAD(zbud_eph_zombie_list);
static LIST_HEAD(zbud_pers_zombie_list);
/*
 * Given a struct page, transform it to a zbudpage so that it can be
 * used by zbud and initialize fields as necessary.
 */
static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
{
	struct zbudpage *zbudpage = (struct zbudpage *)page;

	BUG_ON(page == NULL);
	INIT_LIST_HEAD(&zbudpage->budlist);
	INIT_LIST_HEAD(&zbudpage->lru);
	zbudpage->zbud0_size = 0;
	zbudpage->zbud1_size = 0;
	zbudpage->unevictable = 0;
	if (eph)
		zbud_eph_pageframes++;
	else
		zbud_pers_pageframes++;
	return zbudpage;
}

/* "Transform" a zbudpage back to a struct page suitable to free. */
static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
								bool eph)
{
	struct page *page = (struct page *)zbudpage;

	BUG_ON(!list_empty(&zbudpage->budlist));
	BUG_ON(!list_empty(&zbudpage->lru));
	BUG_ON(zbudpage->zbud0_size != 0);
	BUG_ON(zbudpage->zbud1_size != 0);
	BUG_ON(!PageLocked(page));
	BUG_ON(zbudpage->unevictable != 0);
	BUG_ON(zbudpage_is_dying(zbudpage));
	BUG_ON(zbudpage_is_zombie(zbudpage));
	if (eph)
		zbud_eph_pageframes--;
	else
		zbud_pers_pageframes--;
	zbudpage_spin_unlock(zbudpage);
	page_mapcount_reset(page);
	init_page_count(page);
	page->index = 0;
	return page;
}

/* Mark a zbud as unused and do accounting */
static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
					int budnum, bool eph)
{
	unsigned size;

	BUG_ON(!zbudpage_is_locked(zbudpage));
	if (budnum == 0) {
		size = zbudpage->zbud0_size;
		zbudpage->zbud0_size = 0;
	} else {
		size = zbudpage->zbud1_size;
		zbudpage->zbud1_size = 0;
	}
	if (eph) {
		zbud_eph_zbytes -= size;
		zbud_eph_zpages--;
	} else {
		zbud_pers_zbytes -= size;
		zbud_pers_zpages--;
	}
}

/*
 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
 * to some data, set up the zbud appropriately including data copying
 * and accounting.  Note that if cdata is NULL, the data copying is
 * skipped.  (This is useful for lazy writes such as for RAMster.)
 */
static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
				bool eph, void *cdata,
				unsigned budnum, unsigned size)
{
	char *to;
	void *zbpg;
	struct tmem_handle *to_th;
	unsigned nchunks = zbud_size_to_chunks(size);

	BUG_ON(!zbudpage_is_locked(zbudpage));
	zbpg = kmap_zbudpage_atomic(zbudpage);
	to = zbud_data(zbpg, budnum, size);
	to_th = (struct tmem_handle *)to;
	to_th->index = th->index;
	to_th->oid = th->oid;
	to_th->pool_id = th->pool_id;
	to_th->client_id = th->client_id;
	to += sizeof(struct tmem_handle);
	if (cdata != NULL)
		memcpy(to, cdata, size - sizeof(struct tmem_handle));
	kunmap_zbudpage_atomic(zbpg);
	if (budnum == 0)
		zbudpage->zbud0_size = size;
	else
		zbudpage->zbud1_size = size;
	if (eph) {
		zbud_eph_cumul_chunk_counts[nchunks]++;
		zbud_eph_zpages++;
		zbud_eph_cumul_zpages++;
		zbud_eph_zbytes += size;
		zbud_eph_cumul_zbytes += size;
	} else {
		zbud_pers_cumul_chunk_counts[nchunks]++;
		zbud_pers_zpages++;
		zbud_pers_cumul_zpages++;
		zbud_pers_zbytes += size;
		zbud_pers_cumul_zbytes += size;
	}
}
/*
 * Given a locked dying zbudpage, read out the tmem handles from the data,
 * unlock the page, then use the handles to tell tmem to flush out its
 * references.
 */
static void zbud_evict_tmem(struct zbudpage *zbudpage)
{
	int i, j;
	uint32_t pool_id[2], client_id[2];
	uint32_t index[2];
	struct tmem_oid oid[2];
	struct tmem_pool *pool;
	void *zbpg;
	struct tmem_handle *th;
	unsigned size;

	/* read out the tmem handles from the data and set aside */
	zbpg = kmap_zbudpage_atomic(zbudpage);
	for (i = 0, j = 0; i < 2; i++) {
		size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
		if (size) {
			th = (struct tmem_handle *)zbud_data(zbpg, i, size);
			client_id[j] = th->client_id;
			pool_id[j] = th->pool_id;
			oid[j] = th->oid;
			index[j] = th->index;
			j++;
			zbud_unuse_zbud(zbudpage, i, true);
		}
	}
	kunmap_zbudpage_atomic(zbpg);
	zbudpage_spin_unlock(zbudpage);
	/* zbudpage is now an unlocked dying... tell tmem to flush pointers */
	for (i = 0; i < j; i++) {
		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
		if (pool != NULL) {
			tmem_flush_page(pool, &oid[i], index[i]);
			zcache_put_pool(pool);
		}
	}
}

/*
 * Externally callable zbud handling routines.
 */

/*
 * Return the maximum size compressed page that can be stored (secretly
 * setting aside space for the tmem handle).
 */
unsigned int zbud_max_buddy_size(void)
{
	return zbud_max_size() - sizeof(struct tmem_handle);
}

/*
 * Given a zbud reference, free the corresponding zbud from all lists,
 * mark it as unused, do accounting, and, if the freeing of the zbud
 * frees up an entire pageframe, return it to the caller (else NULL).
 */
struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
				  unsigned int *zsize, unsigned int *zpages)
{
	unsigned long budnum = zbudref_budnum(zref);
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	struct page *page = NULL;
	unsigned chunks, bud_size, other_bud_size;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		zbudpage_spin_unlock(zbudpage);
		spin_unlock(lists_lock);
		*zpages = 0;
		*zsize = 0;
		goto out;
	}
	if (budnum == 0) {
		bud_size = zbudpage->zbud0_size;
		other_bud_size = zbudpage->zbud1_size;
	} else {
		bud_size = zbudpage->zbud1_size;
		other_bud_size = zbudpage->zbud0_size;
	}
	*zsize = bud_size - sizeof(struct tmem_handle);
	*zpages = 1;
	zbud_unuse_zbud(zbudpage, budnum, eph);
	if (other_bud_size == 0) { /* was unbuddied: unlist and free */
		chunks = zbud_size_to_chunks(bud_size);
		if (zbudpage_is_zombie(zbudpage)) {
			if (eph)
				zbud_eph_zombie_count =
				  atomic_dec_return(&zbud_eph_zombie_atomic);
			else
				zbud_pers_zombie_count =
				  atomic_dec_return(&zbud_pers_zombie_atomic);
			zbudpage_clear_zombie(zbudpage);
		} else {
			BUG_ON(list_empty(&unbud[chunks].list));
			list_del_init(&zbudpage->budlist);
			unbud[chunks].count--;
		}
		list_del_init(&zbudpage->lru);
		spin_unlock(lists_lock);
		if (eph)
			zbud_eph_unbuddied_count--;
		else
			zbud_pers_unbuddied_count--;
		page = zbud_unuse_zbudpage(zbudpage, eph);
	} else { /* was buddied: move remaining buddy to unbuddied list */
		chunks = zbud_size_to_chunks(other_bud_size);
		if (!zbudpage_is_zombie(zbudpage)) {
			list_del_init(&zbudpage->budlist);
			list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
			unbud[chunks].count++;
		}
		if (eph) {
			zbud_eph_buddied_count--;
			zbud_eph_unbuddied_count++;
		} else {
			zbud_pers_unbuddied_count++;
			zbud_pers_buddied_count--;
		}
		/* don't mess with lru, no need to move it */
		zbudpage_spin_unlock(zbudpage);
		spin_unlock(lists_lock);
	}
out:
	return page;
}
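
/*
 * Illustrative store-path sketch (not compiled; the caller function below
 * is hypothetical, only the zbud_* calls are real).  It shows the expected
 * caller sequence: check the compressed size against zbud_max_buddy_size(),
 * try to buddy into an existing unbuddied pageframe with zbud_match_prep(),
 * otherwise allocate a raw page (the caller owns all page allocation and
 * freeing) and use zbud_create_prep(), then mark the zbud evictable with
 * zbud_create_finish().
 */
#if 0
static struct zbudref *example_store(struct tmem_handle *th, bool eph,
				     void *cdata, unsigned size)
{
	struct zbudref *zref;
	struct page *newpage;

	if (size > zbud_max_buddy_size())
		return NULL;	/* too large; caller must fall back */
	zref = zbud_match_prep(th, eph, cdata, size);
	if (zref == NULL) {
		newpage = alloc_page(GFP_ATOMIC);  /* caller-owned page */
		if (newpage == NULL)
			return NULL;
		zref = zbud_create_prep(th, eph, cdata, size, newpage);
		if (zref == NULL) {
			__free_page(newpage);	/* still owned by caller */
			return NULL;
		}
	}
	zbud_create_finish(zref, eph);	/* now evictable */
	return zref;
}
#endif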
/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, try to find an unbuddied zbudpage in which to
 * create a zbud.  If found, put it there, mark the zbudpage unevictable,
 * and return a zbudref to it.  Else return NULL.
 */
struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
				void *cdata, unsigned size)
{
	struct zbudpage *zbudpage = NULL, *zbudpage2;
	unsigned long budnum = 0UL;
	unsigned nchunks;
	int i, found_good_buddy = 0;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

	size += sizeof(struct tmem_handle);
	nchunks = zbud_size_to_chunks(size);
	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
		spin_lock(lists_lock);
		if (!list_empty(&unbud[i].list)) {
			list_for_each_entry_safe(zbudpage, zbudpage2,
				    &unbud[i].list, budlist) {
				if (zbudpage_spin_trylock(zbudpage)) {
					found_good_buddy = i;
					goto found_unbuddied;
				}
			}
		}
		spin_unlock(lists_lock);
	}
	zbudpage = NULL;
	goto out;

found_unbuddied:
	BUG_ON(!zbudpage_is_locked(zbudpage));
	BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
	if (zbudpage->zbud0_size == 0)
		budnum = 0UL;
	else if (zbudpage->zbud1_size == 0)
		budnum = 1UL;
	list_del_init(&zbudpage->budlist);
	if (eph) {
		list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
		unbud[found_good_buddy].count--;
		zbud_eph_unbuddied_count--;
		zbud_eph_buddied_count++;
		/* "promote" raw zbudpage to most-recently-used */
		list_del_init(&zbudpage->lru);
		list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
	} else {
		list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
		unbud[found_good_buddy].count--;
		zbud_pers_unbuddied_count--;
		zbud_pers_buddied_count++;
		/* "promote" raw zbudpage to most-recently-used */
		list_del_init(&zbudpage->lru);
		list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
	}
	zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
	zbudpage->unevictable++;
	BUG_ON(zbudpage->unevictable == 3);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
out:
	return zbudpage_to_zbudref(zbudpage, budnum);
}

/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, and a newly allocated struct page, create an unevictable
 * zbud in that new page and return a zbudref to it.
 */
struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
					void *cdata, unsigned size,
					struct page *newpage)
{
	struct zbudpage *zbudpage;
	unsigned long budnum = 0;
	unsigned nchunks;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

#if 0
	/* this may be worth it later to support decompress-in-place? */
	static unsigned long counter;

	budnum = counter++ & 1;	/* alternate using zbud0 and zbud1 */
#endif

	if (size > zbud_max_buddy_size())
		return NULL;
	if (newpage == NULL)
		return NULL;

	size += sizeof(struct tmem_handle);
	nchunks = zbud_size_to_chunks(size);
	spin_lock(lists_lock);
	zbudpage = zbud_init_zbudpage(newpage, eph);
	zbudpage_spin_lock(zbudpage);
	list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
	if (eph) {
		list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
		zbud_eph_unbuddied_count++;
	} else {
		list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
		zbud_pers_unbuddied_count++;
	}
	unbud[nchunks].count++;
	zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
	zbudpage->unevictable++;
	BUG_ON(zbudpage->unevictable == 3);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return zbudpage_to_zbudref(zbudpage, budnum);
}
/*
 * Finish creation of a zbud by, assuming another zbud isn't being created
 * in parallel, marking it evictable.
 */
void zbud_create_finish(struct zbudref *zref, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	BUG_ON(zbudpage_is_dying(zbudpage));
	zbudpage->unevictable--;
	BUG_ON((int)zbudpage->unevictable < 0);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
}

/*
 * Given a zbudref and a struct page, decompress the data from
 * the zbud into the physical page represented by the struct page
 * by upcalling to zcache_decompress.
 */
int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
			void (*decompress)(char *, unsigned int, char *))
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *to_va, *from_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	to_va = kmap_atomic(data_page);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	from_va = zbud_data(zbpg, budnum, size);
	from_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	decompress(from_va, size, to_va);
	kunmap_atomic(to_va);
	kunmap_zbudpage_atomic(zbpg);
	ret = 0;
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}

/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the zbud to the kernel pointer.
 */
int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
			size_t *sizep, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *from_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	from_va = zbud_data(zbpg, budnum, size);
	from_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	*sizep = size;
	memcpy(to_va, from_va, size);
	kunmap_zbudpage_atomic(zbpg);
	ret = 0;
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}
/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the kernel pointer to the zbud.
 */
int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *to_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	to_va = zbud_data(zbpg, budnum, size);
	to_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	memcpy(to_va, from_va, size);
	kunmap_zbudpage_atomic(zbpg);
	ret = 0;
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}

/*
 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
 * there are no references to it remaining, and return the now unused
 * (and re-init'ed) struct page and the total amount of compressed
 * data that was evicted.
 */
struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
{
	struct zbudpage *zbudpage = NULL, *zbudpage2;
	struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
	struct page *page = NULL;
	bool irqs_disabled = irqs_disabled();

	/*
	 * Since this can be called indirectly from cleancache_put, which
	 * has interrupts disabled, as well as frontswap_put, which does not,
	 * we need to be able to handle both cases, even though it is ugly.
	 */
	if (irqs_disabled)
		spin_lock(&zbud_eph_lists_lock);
	else
		spin_lock_bh(&zbud_eph_lists_lock);
	*zsize = 0;
	if (list_empty(&zbud_eph_lru_list))
		goto unlock_out;
	list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
		/* skip a locked zbudpage */
		if (unlikely(!zbudpage_spin_trylock(zbudpage)))
			continue;
		/* skip an unevictable zbudpage */
		if (unlikely(zbudpage->unevictable != 0)) {
			zbudpage_spin_unlock(zbudpage);
			continue;
		}
		/* got a locked evictable page */
		goto evict_page;
	}
unlock_out:
	/* no unlocked evictable pages, give up */
	if (irqs_disabled)
		spin_unlock(&zbud_eph_lists_lock);
	else
		spin_unlock_bh(&zbud_eph_lists_lock);
	goto out;

evict_page:
	list_del_init(&zbudpage->budlist);
	list_del_init(&zbudpage->lru);
	zbudpage_set_dying(zbudpage);
	/*
	 * the zbudpage is now "dying" and attempts to read, write,
	 * or delete data from it will be ignored
	 */
	if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
		*zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
				(2 * sizeof(struct tmem_handle));
		*zpages = 2;
	} else if (zbudpage->zbud0_size != 0) {
		unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
		*zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
		*zpages = 1;
	} else if (zbudpage->zbud1_size != 0) {
		unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
		*zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
		*zpages = 1;
	} else {
		BUG();
	}
	spin_unlock(&zbud_eph_lists_lock);
	zbud_eph_evicted_pageframes++;
	if (*zpages == 1)
		zbud_eph_unbuddied_count--;
	else
		zbud_eph_buddied_count--;
	zbud_evict_tmem(zbudpage);
	zbudpage_spin_lock(zbudpage);
	zbudpage_clear_dying(zbudpage);
	page = zbud_unuse_zbudpage(zbudpage, true);
	if (!irqs_disabled)
		local_bh_enable();
out:
	return page;
}
/*
 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
 * read the tmem_handle(s) out of it into the passed array, and return the
 * number of zbuds.  Caller must perform necessary tmem functions and,
 * indirectly, zbud functions to fetch any valid data and cause the
 * now-zombified zbudpage to eventually be freed.  We track the zombified
 * zbudpage count so it is possible to observe if there is a leak.
 * FIXME: describe (ramster) case where data pointers are passed in for memcpy.
 */
unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
					unsigned int *zsize, bool eph)
{
	struct zbudpage *zbudpage = NULL, *zbudpage2;
	struct tmem_handle *thfrom;
	char *from_va;
	void *zbpg;
	unsigned size;
	int ret = 0, i;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct list_head *lru_list =
		eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;

	spin_lock_bh(lists_lock);
	if (list_empty(lru_list))
		goto out;
	list_for_each_entry_safe(zbudpage, zbudpage2, lru_list, lru) {
		/* skip a locked zbudpage */
		if (unlikely(!zbudpage_spin_trylock(zbudpage)))
			continue;
		/* skip an unevictable zbudpage */
		if (unlikely(zbudpage->unevictable != 0)) {
			zbudpage_spin_unlock(zbudpage);
			continue;
		}
		/* got a locked evictable page */
		goto zombify_page;
	}
	/* no unlocked evictable pages, give up */
	goto out;

zombify_page:
	/* got an unlocked evictable page, zombify it */
	list_del_init(&zbudpage->budlist);
	zbudpage_set_zombie(zbudpage);
	/* FIXME what accounting do I need to do here? */
	list_del_init(&zbudpage->lru);
	if (eph) {
		list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
		zbud_eph_zombie_count =
				atomic_inc_return(&zbud_eph_zombie_atomic);
	} else {
		list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
		zbud_pers_zombie_count =
				atomic_inc_return(&zbud_pers_zombie_atomic);
	}
	/* FIXME what accounting do I need to do here? */
	zbpg = kmap_zbudpage_atomic(zbudpage);
	for (i = 0; i < 2; i++) {
		size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
		if (size) {
			from_va = zbud_data(zbpg, i, size);
			thfrom = (struct tmem_handle *)from_va;
			from_va += sizeof(struct tmem_handle);
			size -= sizeof(struct tmem_handle);
			if (th != NULL)
				th[ret] = *thfrom;
			if (data != NULL)
				memcpy(data[ret], from_va, size);
			if (zsize != NULL)
				*zsize++ = size;
			ret++;
		}
	}
	kunmap_zbudpage_atomic(zbpg);
	zbudpage_spin_unlock(zbudpage);
out:
	spin_unlock_bh(lists_lock);
	return ret;
}

void zbud_init(void)
{
	int i;

	zbud_debugfs_init();
	BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
	BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
	for (i = 0; i < NCHUNKS; i++) {
		INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
		INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
	}
}
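
/*
 * Illustrative reclaim-path sketch (not compiled; the caller function below
 * is hypothetical, only the zbud_* calls are real).  Under memory pressure,
 * the caller evicts an ephemeral pageframe in approximate LRU order;
 * zbud_evict_pageframe_lru() has already told tmem to flush its references,
 * so the returned struct page can simply be freed (or reused) by the caller,
 * and the returned byte/zpage counts fed into the caller's accounting.
 */
#if 0
static bool example_shrink_one_pageframe(void)
{
	unsigned int zsize = 0, zpages = 0;
	struct page *page;

	page = zbud_evict_pageframe_lru(&zsize, &zpages);
	if (page == NULL)
		return false;	/* nothing evictable right now */
	/* caller owns page freeing; zbud never frees pages itself */
	__free_page(page);
	return true;
}
#endif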