author     Richard Braun <rbraun@sceen.net>    2011-12-13 20:27:56 +0000
committer  Richard Braun <rbraun@sceen.net>    2011-12-17 22:12:34 +0000
commit     7bc54a622e0c57a1085cd2990a1deedc8bd4743d (patch)
tree       0356aefb0a935c30d295a86cec2386d5197c4754
parent     d25bd66fe0bd4cddb18890390198c86b9e9b56b4 (diff)
Import the slab allocator
As it is intended to completely replace the zone allocator, remove it on
the way. So long to the venerable code!

* Makefrag.am (libkernel_a_SOURCES): Add kern/slab.{c,h}, remove
  kern/kalloc.c and kern/zalloc.{c,h}.
* configfrag.ac (SLAB_VERIFY, SLAB_USE_CPU_POOLS): Add defines.
* i386/Makefrag.am (libkernel_a_SOURCES): Remove i386/i386/zalloc.h.
* i386/configfrag.ac (CPU_L1_SHIFT): Add define.
* include/mach_debug/slab_info.h: New file.
* kern/slab.c: Likewise.
* kern/slab.h: Likewise.
* i386/i386/zalloc.h: Remove file.
* include/mach_debug/zone_info.h: Likewise.
* kern/kalloc.c: Likewise.
* kern/zalloc.c: Likewise.
* kern/zalloc.h: Likewise.
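The commit replaces the zinit()/zalloc()/zfree() interface with the
kmem_cache_init()/kmem_cache_alloc()/kmem_cache_free() functions declared in
kern/slab.h below. As a rough orientation, here is a minimal usage sketch;
struct foo, foo_cache and the foo_* helpers are hypothetical and only
illustrate the calling convention, they are not part of the patch:

    #include <kern/list.h>
    #include <kern/slab.h>

    struct foo {
        struct list node;       /* pre-constructible state only */
        int refcount;
    };

    static struct kmem_cache foo_cache;

    static void foo_setup(void)
    {
        /* name, object size, alignment, constructor, slab alloc/free hooks, flags */
        kmem_cache_init(&foo_cache, "foo", sizeof(struct foo), 0,
                        NULL, NULL, NULL, 0);
    }

    static struct foo * foo_create(void)
    {
        /* kmem_cache_alloc() returns 0 on failure */
        return (struct foo *)kmem_cache_alloc(&foo_cache);
    }

    static void foo_destroy(struct foo *foo)
    {
        kmem_cache_free(&foo_cache, (vm_offset_t)foo);
    }

General purpose allocations keep the kalloc()/kfree() interface, now backed by
the kalloc_* caches defined in kern/slab.c.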
-rw-r--r--   Makefrag.am                                                                      7
-rw-r--r--   configfrag.ac                                                                    6
-rw-r--r--   i386/Makefrag.am                                                                 1
-rw-r--r--   i386/configfrag.ac                                                               3
-rw-r--r--   i386/i386/zalloc.h                                                              29
-rw-r--r--   include/mach_debug/slab_info.h (renamed from include/mach_debug/zone_info.h)   57
-rw-r--r--   kern/kalloc.c                                                                  254
-rw-r--r--   kern/slab.c                                                                   1576
-rw-r--r--   kern/slab.h                                                                    222
-rw-r--r--   kern/zalloc.c                                                                 1007
-rw-r--r--   kern/zalloc.h                                                                  136
11 files changed, 1839 insertions, 1459 deletions
diff --git a/Makefrag.am b/Makefrag.am
index 07b64997..b3e131a8 100644
--- a/Makefrag.am
+++ b/Makefrag.am
@@ -151,7 +151,6 @@ libkernel_a_SOURCES += \
kern/ipc_sched.h \
kern/ipc_tt.c \
kern/ipc_tt.h \
- kern/kalloc.c \
kern/kalloc.h \
kern/kern_types.h \
kern/list.h \
@@ -180,6 +179,8 @@ libkernel_a_SOURCES += \
kern/rbtree.h \
kern/rbtree_i.h \
kern/refcount.h \
+ kern/slab.c \
+ kern/slab.h \
kern/sched.h \
kern/sched_prim.c \
kern/sched_prim.h \
@@ -204,8 +205,6 @@ libkernel_a_SOURCES += \
kern/timer.h \
kern/xpr.c \
kern/xpr.h \
- kern/zalloc.c \
- kern/zalloc.h \
kern/elf-load.c \
kern/boot_script.c
EXTRA_DIST += \
@@ -406,7 +405,7 @@ include_mach_eXec_HEADERS = \
# mach-debug-headers:= $(addprefix mach_debug/, hash_info.h ipc_info.h \
# mach_debug.defs mach_debug_types.defs mach_debug_types.h \
-# pc_info.h vm_info.h zone_info.h)
+# pc_info.h vm_info.h slab_info.h)
# Other headers for the distribution. We don't install these, because the
# GNU C library has correct versions for users to use.
diff --git a/configfrag.ac b/configfrag.ac
index 77b00248..5f13b63c 100644
--- a/configfrag.ac
+++ b/configfrag.ac
@@ -102,6 +102,12 @@ AC_DEFINE([STAT_TIME], [1], [STAT_TIME])
# Kernel tracing.
AC_DEFINE([XPR_DEBUG], [1], [XPR_DEBUG])
+
+# Slab allocator debugging facilities.
+AC_DEFINE([SLAB_VERIFY], [0], [SLAB_VERIFY])
+
+# Enable the CPU pool layer in the slab allocator.
+AC_DEFINE([SLAB_USE_CPU_POOLS], [0], [SLAB_USE_CPU_POOLS])
#
# Options.
diff --git a/i386/Makefrag.am b/i386/Makefrag.am
index ab3502ac..aca4215d 100644
--- a/i386/Makefrag.am
+++ b/i386/Makefrag.am
@@ -136,7 +136,6 @@ libkernel_a_SOURCES += \
i386/i386/vm_param.h \
i386/i386/vm_tuning.h \
i386/i386/xpr.h \
- i386/i386/zalloc.h \
i386/intel/pmap.c \
i386/intel/pmap.h \
i386/intel/read_fault.c \
diff --git a/i386/configfrag.ac b/i386/configfrag.ac
index e4ce97ef..77f66af9 100644
--- a/i386/configfrag.ac
+++ b/i386/configfrag.ac
@@ -25,6 +25,9 @@ dnl USE OF THIS SOFTWARE.
AC_DEFINE([__ELF__], [1], [__ELF__])
AC_DEFINE([i386], [1], [i386])
+ # Determines the size of the CPU cache line.
+ AC_DEFINE([CPU_L1_SHIFT], [6], [CPU_L1_SHIFT])
+
[# Does the architecture provide machine-specific interfaces?
mach_machine_routines=1;;
*)]
diff --git a/i386/i386/zalloc.h b/i386/i386/zalloc.h
deleted file mode 100644
index bf7cf6b2..00000000
--- a/i386/i386/zalloc.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 1996-1994 The University of Utah and
- * the Computer Systems Laboratory (CSL). All rights reserved.
- *
- * Permission to use, copy, modify and distribute this software is hereby
- * granted provided that (1) source code retains these copyright, permission,
- * and disclaimer notices, and (2) redistributions including binaries
- * reproduce the notices in supporting documentation, and (3) all advertising
- * materials mentioning features or use of this software display the following
- * acknowledgement: ``This product includes software developed by the
- * Computer Systems Laboratory at the University of Utah.''
- *
- * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS
- * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF
- * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- *
- * CSL requests users of this software to return to csl-dist@cs.utah.edu any
- * improvements that they make and grant CSL redistribution rights.
- *
- * Utah $Hdr: zalloc.h 1.4 94/12/16$
- * Author: Bryan Ford
- */
-
-#ifndef _I386_ZALLOC_H_
-#define _I386_ZALLOC_H_
-
-#include <kern/zalloc.h>
-
-#endif /* _I386_ZALLOC_H_ */
diff --git a/include/mach_debug/zone_info.h b/include/mach_debug/slab_info.h
index 1b36fe02..37dcb8c4 100644
--- a/include/mach_debug/zone_info.h
+++ b/include/mach_debug/slab_info.h
@@ -24,38 +24,39 @@
* the rights to redistribute these changes.
*/
-#ifndef _MACH_DEBUG_ZONE_INFO_H_
-#define _MACH_DEBUG_ZONE_INFO_H_
+#ifndef _MACH_DEBUG_SLAB_INFO_H_
+#define _MACH_DEBUG_SLAB_INFO_H_
-#include <mach/boolean.h>
-#include <mach/machine/vm_types.h>
+#include <sys/types.h>
/*
* Remember to update the mig type definitions
* in mach_debug_types.defs when adding/removing fields.
*/
-#define ZONE_NAME_MAX_LEN 80
-
-typedef struct zone_name {
- char zn_name[ZONE_NAME_MAX_LEN];
-} zone_name_t;
-
-typedef zone_name_t *zone_name_array_t;
-
-
-typedef struct zone_info {
- integer_t zi_count; /* Number of elements used now */
- vm_size_t zi_cur_size; /* current memory utilization */
- vm_size_t zi_max_size; /* how large can this zone grow */
- vm_size_t zi_elem_size; /* size of an element */
- vm_size_t zi_alloc_size; /* size used for more memory */
-/*boolean_t*/integer_t zi_pageable; /* zone pageable? */
-/*boolean_t*/integer_t zi_sleepable; /* sleep if empty? */
-/*boolean_t*/integer_t zi_exhaustible; /* merely return if empty? */
-/*boolean_t*/integer_t zi_collectable; /* garbage collect elements? */
-} zone_info_t;
-
-typedef zone_info_t *zone_info_array_t;
-
-#endif /* _MACH_DEBUG_ZONE_INFO_H_ */
+#define CACHE_NAME_MAX_LEN 32
+
+#define CACHE_FLAGS_NO_CPU_POOL 0x01
+#define CACHE_FLAGS_SLAB_EXTERNAL 0x02
+#define CACHE_FLAGS_NO_RECLAIM 0x04
+#define CACHE_FLAGS_VERIFY 0x08
+#define CACHE_FLAGS_DIRECT 0x10
+
+typedef struct cache_info {
+ int flags;
+ size_t cpu_pool_size;
+ size_t obj_size;
+ size_t align;
+ size_t buf_size;
+ size_t slab_size;
+ unsigned long bufs_per_slab;
+ unsigned long nr_objs;
+ unsigned long nr_bufs;
+ unsigned long nr_slabs;
+ unsigned long nr_free_slabs;
+ char name[CACHE_NAME_MAX_LEN];
+} cache_info_t;
+
+typedef cache_info_t *cache_info_array_t;
+
+#endif /* _MACH_DEBUG_SLAB_INFO_H_ */
diff --git a/kern/kalloc.c b/kern/kalloc.c
deleted file mode 100644
index 8256305b..00000000
--- a/kern/kalloc.c
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University.
- * Copyright (c) 1993,1994 The University of Utah and
- * the Computer Systems Laboratory (CSL).
- * All rights reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF
- * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY
- * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF
- * THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
- * School of Computer Science
- * Carnegie Mellon University
- * Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- * File: kern/kalloc.c
- * Author: Avadis Tevanian, Jr.
- * Date: 1985
- *
- * General kernel memory allocator. This allocator is designed
- * to be used by the kernel to manage dynamic memory fast.
- */
-
-#include <mach/machine/vm_types.h>
-#include <mach/vm_param.h>
-
-#include <kern/debug.h>
-#include <kern/zalloc.h>
-#include <kern/kalloc.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_object.h>
-#include <vm/vm_map.h>
-
-
-
-vm_map_t kalloc_map;
-vm_size_t kalloc_map_size = 64 * 1024 * 1024;
-vm_size_t kalloc_max;
-
-/*
- * All allocations of size less than kalloc_max are rounded to the
- * next highest power of 2. This allocator is built on top of
- * the zone allocator. A zone is created for each potential size
- * that we are willing to get in small blocks.
- *
- * We assume that kalloc_max is not greater than 64K;
- * thus 16 is a safe array size for k_zone and k_zone_name.
- */
-
-int first_k_zone = -1;
-struct zone *k_zone[16];
-static char *k_zone_name[16] = {
- "kalloc.1", "kalloc.2",
- "kalloc.4", "kalloc.8",
- "kalloc.16", "kalloc.32",
- "kalloc.64", "kalloc.128",
- "kalloc.256", "kalloc.512",
- "kalloc.1024", "kalloc.2048",
- "kalloc.4096", "kalloc.8192",
- "kalloc.16384", "kalloc.32768"
-};
-
-/*
- * Max number of elements per zone. zinit rounds things up correctly
- * Doing things this way permits each zone to have a different maximum size
- * based on need, rather than just guessing; it also
- * means its patchable in case you're wrong!
- */
-unsigned long k_zone_max[16] = {
- 1024, /* 1 Byte */
- 1024, /* 2 Byte */
- 1024, /* 4 Byte */
- 1024, /* 8 Byte */
- 1024, /* 16 Byte */
- 4096, /* 32 Byte */
- 4096, /* 64 Byte */
- 4096, /* 128 Byte */
- 4096, /* 256 Byte */
- 1024, /* 512 Byte */
- 1024, /* 1024 Byte */
- 1024, /* 2048 Byte */
- 1024, /* 4096 Byte */
- 4096, /* 8192 Byte */
- 64, /* 16384 Byte */
- 64, /* 32768 Byte */
-};
-
-/*
- * Initialize the memory allocator. This should be called only
- * once on a system wide basis (i.e. first processor to get here
- * does the initialization).
- *
- * This initializes all of the zones.
- */
-
-#ifndef NDEBUG
-static int kalloc_init_called;
-#endif
-
-void kalloc_init()
-{
- vm_offset_t min, max;
- vm_size_t size;
- register int i;
-
- assert (! kalloc_init_called);
-
- kalloc_map = kmem_suballoc(kernel_map, &min, &max,
- kalloc_map_size, FALSE);
-
- /*
- * Ensure that zones up to size 8192 bytes exist.
- * This is desirable because messages are allocated
- * with kalloc, and messages up through size 8192 are common.
- */
-
- if (PAGE_SIZE < 16*1024)
- kalloc_max = 16*1024;
- else
- kalloc_max = PAGE_SIZE;
-
- /*
- * Allocate a zone for each size we are going to handle.
- * We specify non-paged memory.
- */
- for (i = 0, size = 1; size < kalloc_max; i++, size <<= 1) {
- if (size < MINSIZE) {
- k_zone[i] = 0;
- continue;
- }
- if (size == MINSIZE) {
- first_k_zone = i;
- }
- k_zone[i] = zinit(size, 0, k_zone_max[i] * size, size,
- size >= PAGE_SIZE ? ZONE_COLLECTABLE : 0,
- k_zone_name[i]);
- }
-
-#ifndef NDEBUG
- kalloc_init_called = 1;
-#endif
-}
-
-vm_offset_t kalloc(size)
- vm_size_t size;
-{
- register int zindex;
- register vm_size_t allocsize;
- vm_offset_t addr;
-
- /* compute the size of the block that we will actually allocate */
-
- assert (kalloc_init_called);
-
- allocsize = size;
- if (size < kalloc_max) {
- allocsize = MINSIZE;
- zindex = first_k_zone;
- while (allocsize < size) {
- allocsize <<= 1;
- zindex++;
- }
- }
-
- /*
- * If our size is still small enough, check the queue for that size
- * and allocate.
- */
-
- if (allocsize < kalloc_max) {
- addr = zalloc(k_zone[zindex]);
- } else {
- if (kmem_alloc_wired(kalloc_map, &addr, allocsize)
- != KERN_SUCCESS)
- addr = 0;
- }
- return(addr);
-}
-
-vm_offset_t kget(size)
- vm_size_t size;
-{
- register int zindex;
- register vm_size_t allocsize;
- vm_offset_t addr;
-
- assert (kalloc_init_called);
-
- /* compute the size of the block that we will actually allocate */
-
- allocsize = size;
- if (size < kalloc_max) {
- allocsize = MINSIZE;
- zindex = first_k_zone;
- while (allocsize < size) {
- allocsize <<= 1;
- zindex++;
- }
- }
-
- /*
- * If our size is still small enough, check the queue for that size
- * and allocate.
- */
-
- if (allocsize < kalloc_max) {
- addr = zget(k_zone[zindex]);
- } else {
- /* This will never work, so we might as well panic */
- panic("kget");
- }
- return(addr);
-}
-
-void
-kfree(data, size)
- vm_offset_t data;
- vm_size_t size;
-{
- register int zindex;
- register vm_size_t freesize;
-
- assert (kalloc_init_called);
-
- freesize = size;
- if (size < kalloc_max) {
- freesize = MINSIZE;
- zindex = first_k_zone;
- while (freesize < size) {
- freesize <<= 1;
- zindex++;
- }
- }
-
- if (freesize < kalloc_max) {
- zfree(k_zone[zindex], data);
- } else {
- kmem_free(kalloc_map, data, freesize);
- }
-}
diff --git a/kern/slab.c b/kern/slab.c
new file mode 100644
index 00000000..38413e83
--- /dev/null
+++ b/kern/slab.c
@@ -0,0 +1,1576 @@
+/*
+ * Copyright (c) 2009, 2010, 2011 Richard Braun.
+ * Copyright (c) 2011 Maksym Planeta.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * Object caching and general purpose memory allocator.
+ *
+ * This allocator is based on the paper "The Slab Allocator: An Object-Caching
+ * Kernel Memory Allocator" by Jeff Bonwick.
+ *
+ * It allows the allocation of objects (i.e. fixed-size typed buffers) from
+ * caches and is efficient in both space and time. This implementation follows
+ * many of the indications from the paper mentioned. The most notable
+ * differences are outlined below.
+ *
+ * The per-cache self-scaling hash table for buffer-to-bufctl conversion,
+ * described in 3.2.3 "Slab Layout for Large Objects", has been replaced by
+ * a red-black tree storing slabs, sorted by address. The use of a
+ * self-balancing tree for buffer-to-slab conversions provides a few advantages
+ * over a hash table. Unlike a hash table, a BST provides a "lookup nearest"
+ * operation, so obtaining the slab data (whether it is embedded in the slab or
+ * off slab) from a buffer address simply consists of a "lookup nearest towards
+ * 0" tree search. Storing slabs instead of buffers also considerably reduces
+ * the number of elements to retain. Finally, a self-balancing tree is a true
+ * self-scaling data structure, whereas a hash table requires periodic
+ * maintenance and complete resizing, which is expensive. The only drawback is
+ * that releasing a buffer to the slab layer takes logarithmic time instead of
+ * constant time. But as the data set size is kept reasonable (because slabs
+ * are stored instead of buffers) and because the CPU pool layer services most
+ * requests, avoiding many accesses to the slab layer, it is considered an
+ * acceptable tradeoff.
+ *
+ * This implementation uses per-cpu pools of objects, which service most
+ * allocation requests. These pools act as caches (but are named differently
+ * to avoid confusion with CPU caches) that reduce contention on multiprocessor
+ * systems. When a pool is empty and cannot provide an object, it is filled by
+ * transferring multiple objects from the slab layer. The symmetric case is
+ * handled likewise.
+ */
+
+#include <string.h>
+#include <kern/assert.h>
+#include <kern/mach_clock.h>
+#include <kern/printf.h>
+#include <kern/slab.h>
+#include <kern/kalloc.h>
+#include <kern/cpu_number.h>
+#include <mach/vm_param.h>
+#include <mach/machine/vm_types.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_types.h>
+#include <sys/types.h>
+
+#ifdef MACH_DEBUG
+#include <mach_debug/slab_info.h>
+#endif
+
+/*
+ * Utility macros.
+ */
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#define P2ALIGNED(x, a) (((x) & ((a) - 1)) == 0)
+#define ISP2(x) P2ALIGNED(x, x)
+#define P2ALIGN(x, a) ((x) & -(a))
+#define P2ROUND(x, a) (-(-(x) & -(a)))
+#define P2END(x, a) (-(~(x) & -(a)))
+#define likely(expr) __builtin_expect(!!(expr), 1)
+#define unlikely(expr) __builtin_expect(!!(expr), 0)
+
+/*
+ * Minimum required alignment.
+ */
+#define KMEM_ALIGN_MIN 8
+
+/*
+ * Minimum number of buffers per slab.
+ *
+ * This value is ignored when the slab size exceeds a threshold.
+ */
+#define KMEM_MIN_BUFS_PER_SLAB 8
+
+/*
+ * Special slab size beyond which the minimum number of buffers per slab is
+ * ignored when computing the slab size of a cache.
+ */
+#define KMEM_SLAB_SIZE_THRESHOLD (8 * PAGE_SIZE)
+
+/*
+ * Special buffer size under which slab data is unconditionally allocated
+ * from its associated slab.
+ */
+#define KMEM_BUF_SIZE_THRESHOLD (PAGE_SIZE / 8)
+
+/*
+ * Time (in seconds) between two garbage collection operations.
+ */
+#define KMEM_GC_INTERVAL (1 * hz)
+
+/*
+ * The transfer size of a CPU pool is computed by dividing the pool size by
+ * this value.
+ */
+#define KMEM_CPU_POOL_TRANSFER_RATIO 2
+
+/*
+ * Redzone guard word.
+ */
+#ifdef __LP64__
+#if _HOST_BIG_ENDIAN
+#define KMEM_REDZONE_WORD 0xfeedfacefeedfaceUL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_REDZONE_WORD 0xcefaedfecefaedfeUL
+#endif /* _HOST_BIG_ENDIAN */
+#else /* __LP64__ */
+#if _HOST_BIG_ENDIAN
+#define KMEM_REDZONE_WORD 0xfeedfaceUL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_REDZONE_WORD 0xcefaedfeUL
+#endif /* _HOST_BIG_ENDIAN */
+#endif /* __LP64__ */
+
+/*
+ * Redzone byte for padding.
+ */
+#define KMEM_REDZONE_BYTE 0xbb
+
+/*
+ * Size of the VM submap from which default backend functions allocate.
+ */
+#define KMEM_MAP_SIZE (64 * 1024 * 1024)
+
+/*
+ * Shift for the first kalloc cache size.
+ */
+#define KALLOC_FIRST_SHIFT 5
+
+/*
+ * Number of caches backing general purpose allocations.
+ */
+#define KALLOC_NR_CACHES 13
+
+/*
+ * Size of the VM submap for general purpose allocations.
+ */
+#define KALLOC_MAP_SIZE (64 * 1024 * 1024)
+
+/*
+ * Values the buftag state member can take.
+ */
+#ifdef __LP64__
+#if _HOST_BIG_ENDIAN
+#define KMEM_BUFTAG_ALLOC 0xa110c8eda110c8edUL
+#define KMEM_BUFTAG_FREE 0xf4eeb10cf4eeb10cUL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_BUFTAG_ALLOC 0xedc810a1edc810a1UL
+#define KMEM_BUFTAG_FREE 0x0cb1eef40cb1eef4UL
+#endif /* _HOST_BIG_ENDIAN */
+#else /* __LP64__ */
+#if _HOST_BIG_ENDIAN
+#define KMEM_BUFTAG_ALLOC 0xa110c8edUL
+#define KMEM_BUFTAG_FREE 0xf4eeb10cUL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_BUFTAG_ALLOC 0xedc810a1UL
+#define KMEM_BUFTAG_FREE 0x0cb1eef4UL
+#endif /* _HOST_BIG_ENDIAN */
+#endif /* __LP64__ */
+
+/*
+ * Free and uninitialized patterns.
+ *
+ * These values are unconditionally 64-bit wide since buffers are at least
+ * 8-byte aligned.
+ */
+#if _HOST_BIG_ENDIAN
+#define KMEM_FREE_PATTERN 0xdeadbeefdeadbeefULL
+#define KMEM_UNINIT_PATTERN 0xbaddcafebaddcafeULL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_FREE_PATTERN 0xefbeaddeefbeaddeULL
+#define KMEM_UNINIT_PATTERN 0xfecaddbafecaddbaULL
+#endif /* _HOST_BIG_ENDIAN */
+
+/*
+ * Cache flags.
+ *
+ * The flags don't change once set and can be tested without locking.
+ */
+#define KMEM_CF_NO_CPU_POOL 0x01 /* CPU pool layer disabled */
+#define KMEM_CF_SLAB_EXTERNAL 0x02 /* Slab data is off slab */
+#define KMEM_CF_NO_RECLAIM 0x04 /* Slabs are not reclaimable */
+#define KMEM_CF_VERIFY 0x08 /* Debugging facilities enabled */
+#define KMEM_CF_DIRECT 0x10 /* No buf-to-slab tree lookup */
+
+/*
+ * Options for kmem_cache_alloc_verify().
+ */
+#define KMEM_AV_NOCONSTRUCT 0
+#define KMEM_AV_CONSTRUCT 1
+
+/*
+ * Error codes for kmem_cache_error().
+ */
+#define KMEM_ERR_INVALID 0 /* Invalid address being freed */
+#define KMEM_ERR_DOUBLEFREE 1 /* Freeing already free address */
+#define KMEM_ERR_BUFTAG 2 /* Invalid buftag content */
+#define KMEM_ERR_MODIFIED 3 /* Buffer modified while free */
+#define KMEM_ERR_REDZONE 4 /* Redzone violation */
+
+#if SLAB_USE_CPU_POOLS
+/*
+ * Available CPU pool types.
+ *
+ * For each entry, the CPU pool size applies from the entry buf_size
+ * (excluded) up to (and including) the buf_size of the preceding entry.
+ *
+ * See struct kmem_cpu_pool_type for a description of the values.
+ */
+static struct kmem_cpu_pool_type kmem_cpu_pool_types[] = {
+ { 32768, 1, 0, NULL },
+ { 4096, 8, CPU_L1_SIZE, NULL },
+ { 256, 64, CPU_L1_SIZE, NULL },
+ { 0, 128, CPU_L1_SIZE, NULL }
+};
+
+/*
+ * Caches where CPU pool arrays are allocated from.
+ */
+static struct kmem_cache kmem_cpu_array_caches[ARRAY_SIZE(kmem_cpu_pool_types)];
+#endif /* SLAB_USE_CPU_POOLS */
+
+/*
+ * Cache for off slab data.
+ */
+static struct kmem_cache kmem_slab_cache;
+
+/*
+ * General purpose caches array.
+ */
+static struct kmem_cache kalloc_caches[KALLOC_NR_CACHES];
+
+/*
+ * List of all caches managed by the allocator.
+ */
+static struct list kmem_cache_list;
+static unsigned int kmem_nr_caches;
+static simple_lock_data_t __attribute__((used)) kmem_cache_list_lock;
+
+/*
+ * VM submap for slab caches (except general purpose allocations).
+ */
+static struct vm_map kmem_map_store;
+vm_map_t kmem_map = &kmem_map_store;
+
+/*
+ * VM submap for general purpose allocations.
+ */
+static struct vm_map kalloc_map_store;
+vm_map_t kalloc_map = &kalloc_map_store;
+
+/*
+ * Time of the last memory reclaim, in clock ticks.
+ */
+static unsigned int kmem_gc_last_tick;
+
+#define kmem_error(format, ...) \
+ printf("mem: error: %s(): " format "\n", __func__, \
+ ## __VA_ARGS__)
+
+#define kmem_warn(format, ...) \
+ printf("mem: warning: %s(): " format "\n", __func__, \
+ ## __VA_ARGS__)
+
+#define kmem_print(format, ...) \
+ printf(format "\n", ## __VA_ARGS__)
+
+static void kmem_cache_error(struct kmem_cache *cache, void *buf, int error,
+ void *arg);
+static void * kmem_cache_alloc_from_slab(struct kmem_cache *cache);
+static void kmem_cache_free_to_slab(struct kmem_cache *cache, void *buf);
+
+static void * kmem_buf_verify_bytes(void *buf, void *pattern, size_t size)
+{
+ char *ptr, *pattern_ptr, *end;
+
+ end = buf + size;
+
+ for (ptr = buf, pattern_ptr = pattern; ptr < end; ptr++, pattern_ptr++)
+ if (*ptr != *pattern_ptr)
+ return ptr;
+
+ return NULL;
+}
+
+static void * kmem_buf_verify(void *buf, uint64_t pattern, vm_size_t size)
+{
+ uint64_t *ptr, *end;
+
+ assert(P2ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+ assert(P2ALIGNED(size, sizeof(uint64_t)));
+
+ end = buf + size;
+
+ for (ptr = buf; ptr < end; ptr++)
+ if (*ptr != pattern)
+ return kmem_buf_verify_bytes(ptr, &pattern, sizeof(pattern));
+
+ return NULL;
+}
+
+static void kmem_buf_fill(void *buf, uint64_t pattern, size_t size)
+{
+ uint64_t *ptr, *end;
+
+ assert(P2ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+ assert(P2ALIGNED(size, sizeof(uint64_t)));
+
+ end = buf + size;
+
+ for (ptr = buf; ptr < end; ptr++)
+ *ptr = pattern;
+}
+
+static void * kmem_buf_verify_fill(void *buf, uint64_t old, uint64_t new,
+ size_t size)
+{
+ uint64_t *ptr, *end;
+
+ assert(P2ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+ assert(P2ALIGNED(size, sizeof(uint64_t)));
+
+ end = buf + size;
+
+ for (ptr = buf; ptr < end; ptr++) {
+ if (*ptr != old)
+ return kmem_buf_verify_bytes(ptr, &old, sizeof(old));
+
+ *ptr = new;
+ }
+
+ return NULL;
+}
+
+static inline union kmem_bufctl *
+kmem_buf_to_bufctl(void *buf, struct kmem_cache *cache)
+{
+ return (union kmem_bufctl *)(buf + cache->bufctl_dist);
+}
+
+static inline struct kmem_buftag *
+kmem_buf_to_buftag(void *buf, struct kmem_cache *cache)
+{
+ return (struct kmem_buftag *)(buf + cache->buftag_dist);
+}
+
+static inline void * kmem_bufctl_to_buf(union kmem_bufctl *bufctl,
+ struct kmem_cache *cache)
+{
+ return (void *)bufctl - cache->bufctl_dist;
+}
+
+static vm_offset_t kmem_pagealloc(vm_size_t size)
+{
+ vm_offset_t addr;
+ kern_return_t kr;
+
+ kr = kmem_alloc_wired(kmem_map, &addr, size);
+
+ if (kr != KERN_SUCCESS)
+ return 0;
+
+ return addr;
+}
+
+static void kmem_pagefree(vm_offset_t ptr, vm_size_t size)
+{
+ kmem_free(kmem_map, ptr, size);
+}
+
+static void kmem_slab_create_verify(struct kmem_slab *slab,
+ struct kmem_cache *cache)
+{
+ struct kmem_buftag *buftag;
+ size_t buf_size;
+ unsigned long buffers;
+ void *buf;
+
+ buf_size = cache->buf_size;
+ buf = slab->addr;
+ buftag = kmem_buf_to_buftag(buf, cache);
+
+ for (buffers = cache->bufs_per_slab; buffers != 0; buffers--) {
+ kmem_buf_fill(buf, KMEM_FREE_PATTERN, cache->bufctl_dist);
+ buftag->state = KMEM_BUFTAG_FREE;
+ buf += buf_size;
+ buftag = kmem_buf_to_buftag(buf, cache);
+ }
+}
+
+/*
+ * Create an empty slab for a cache.
+ *
+ * The caller must drop all locks before calling this function.
+ */
+static struct kmem_slab * kmem_slab_create(struct kmem_cache *cache,
+ size_t color)
+{
+ struct kmem_slab *slab;
+ union kmem_bufctl *bufctl;
+ size_t buf_size;
+ unsigned long buffers;
+ void *slab_buf;
+
+ if (cache->slab_alloc_fn == NULL)
+ slab_buf = (void *)kmem_pagealloc(cache->slab_size);
+ else
+ slab_buf = (void *)cache->slab_alloc_fn(cache->slab_size);
+
+ if (slab_buf == NULL)
+ return NULL;
+
+ if (cache->flags & KMEM_CF_SLAB_EXTERNAL) {
+ assert(!(cache->flags & KMEM_CF_NO_RECLAIM));
+ slab = (struct kmem_slab *)kmem_cache_alloc(&kmem_slab_cache);
+
+ if (slab == NULL) {
+ if (cache->slab_free_fn == NULL)
+ kmem_pagefree((vm_offset_t)slab_buf, cache->slab_size);
+ else
+ cache->slab_free_fn((vm_offset_t)slab_buf, cache->slab_size);
+
+ return NULL;
+ }
+ } else {
+ slab = (struct kmem_slab *)(slab_buf + cache->slab_size) - 1;
+ }
+
+ list_node_init(&slab->list_node);
+ rbtree_node_init(&slab->tree_node);
+ slab->nr_refs = 0;
+ slab->first_free = NULL;
+ slab->addr = slab_buf + color;
+
+ buf_size = cache->buf_size;
+ bufctl = kmem_buf_to_bufctl(slab->addr, cache);
+
+ for (buffers = cache->bufs_per_slab; buffers != 0; buffers--) {
+ bufctl->next = slab->first_free;
+ slab->first_free = bufctl;
+ bufctl = (union kmem_bufctl *)((void *)bufctl + buf_size);
+ }
+
+ if (cache->flags & KMEM_CF_VERIFY)
+ kmem_slab_create_verify(slab, cache);
+
+ return slab;
+}
+
+static void kmem_slab_destroy_verify(struct kmem_slab *slab,
+ struct kmem_cache *cache)
+{
+ struct kmem_buftag *buftag;
+ size_t buf_size;
+ unsigned long buffers;
+ void *buf, *addr;
+
+ buf_size = cache->buf_size;
+ buf = slab->addr;
+ buftag = kmem_buf_to_buftag(buf, cache);
+
+ for (buffers = cache->bufs_per_slab; buffers != 0; buffers--) {
+ if (buftag->state != KMEM_BUFTAG_FREE)
+ kmem_cache_error(cache, buf, KMEM_ERR_BUFTAG, buftag);
+
+ addr = kmem_buf_verify(buf, KMEM_FREE_PATTERN, cache->bufctl_dist);
+
+ if (addr != NULL)
+ kmem_cache_error(cache, buf, KMEM_ERR_MODIFIED, addr);
+
+ buf += buf_size;
+ buftag = kmem_buf_to_buftag(buf, cache);
+ }
+}
+
+/*
+ * Destroy a slab.
+ *
+ * The caller must drop all locks before calling this function.
+ */
+static void kmem_slab_destroy(struct kmem_slab *slab, struct kmem_cache *cache)
+{
+ vm_offset_t slab_buf;
+
+ assert(slab->nr_refs == 0);
+ assert(slab->first_free != NULL);
+ assert(!(cache->flags & KMEM_CF_NO_RECLAIM));
+
+ if (cache->flags & KMEM_CF_VERIFY)
+ kmem_slab_destroy_verify(slab, cache);
+
+ slab_buf = (vm_offset_t)P2ALIGN((unsigned long)slab->addr, PAGE_SIZE);
+
+ if (cache->slab_free_fn == NULL)
+ kmem_pagefree(slab_buf, cache->slab_size);
+ else
+ cache->slab_free_fn(slab_buf, cache->slab_size);
+
+ if (cache->flags & KMEM_CF_SLAB_EXTERNAL)
+ kmem_cache_free(&kmem_slab_cache, (vm_offset_t)slab);
+}
+
+static inline int kmem_slab_use_tree(int flags)
+{
+ return !(flags & KMEM_CF_DIRECT) || (flags & KMEM_CF_VERIFY);
+}
+
+static inline int kmem_slab_cmp_lookup(const void *addr,
+ const struct rbtree_node *node)
+{
+ struct kmem_slab *slab;
+
+ slab = rbtree_entry(node, struct kmem_slab, tree_node);
+
+ if (addr == slab->addr)
+ return 0;
+ else if (addr < slab->addr)
+ return -1;
+ else
+ return 1;
+}
+
+static inline int kmem_slab_cmp_insert(const struct rbtree_node *a,
+ const struct rbtree_node *b)
+{
+ struct kmem_slab *slab;
+
+ slab = rbtree_entry(a, struct kmem_slab, tree_node);
+ return kmem_slab_cmp_lookup(slab->addr, b);
+}
+
+#if SLAB_USE_CPU_POOLS
+static void kmem_cpu_pool_init(struct kmem_cpu_pool *cpu_pool,
+ struct kmem_cache *cache)
+{
+ simple_lock_init(&cpu_pool->lock);
+ cpu_pool->flags = cache->flags;
+ cpu_pool->size = 0;
+ cpu_pool->transfer_size = 0;
+ cpu_pool->nr_objs = 0;
+ cpu_pool->array = NULL;
+}
+
+/*
+ * Return a CPU pool.
+ *
+ * This function will generally return the pool matching the CPU running the
+ * calling thread. Because of context switches and thread migration, the
+ * caller might be running on another processor after this function returns.
+ * Although not optimal, this should rarely happen, and it doesn't affect the
+ * allocator operations in any other way, as CPU pools are always valid, and
+ * their access is serialized by a lock.
+ */
+static inline struct kmem_cpu_pool * kmem_cpu_pool_get(struct kmem_cache *cache)
+{
+ return &cache->cpu_pools[cpu_number()];
+}
+
+static inline void kmem_cpu_pool_build(struct kmem_cpu_pool *cpu_pool,
+ struct kmem_cache *cache, void **array)
+{
+ cpu_pool->size = cache->cpu_pool_type->array_size;
+ cpu_pool->transfer_size = (cpu_pool->size
+ + KMEM_CPU_POOL_TRANSFER_RATIO - 1)
+ / KMEM_CPU_POOL_TRANSFER_RATIO;
+ cpu_pool->array = array;
+}
+
+static inline void * kmem_cpu_pool_pop(struct kmem_cpu_pool *cpu_pool)
+{
+ cpu_pool->nr_objs--;
+ return cpu_pool->array[cpu_pool->nr_objs];
+}
+
+static inline void kmem_cpu_pool_push(struct kmem_cpu_pool *cpu_pool, void *obj)
+{
+ cpu_pool->array[cpu_pool->nr_objs] = obj;
+ cpu_pool->nr_objs++;
+}
+
+static int kmem_cpu_pool_fill(struct kmem_cpu_pool *cpu_pool,
+ struct kmem_cache *cache)
+{
+ void *obj;
+ int i;
+
+ simple_lock(&cache->lock);
+
+ for (i = 0; i < cpu_pool->transfer_size; i++) {
+ obj = kmem_cache_alloc_from_slab(cache);
+
+ if (obj == NULL)
+ break;
+
+ kmem_cpu_pool_push(cpu_pool, obj);
+ }
+
+ simple_unlock(&cache->lock);
+
+ return i;
+}
+
+static void kmem_cpu_pool_drain(struct kmem_cpu_pool *cpu_pool,
+ struct kmem_cache *cache)
+{
+ void *obj;
+ int i;
+
+ simple_lock(&cache->lock);
+
+ for (i = cpu_pool->transfer_size; i > 0; i--) {
+ obj = kmem_cpu_pool_pop(cpu_pool);
+ kmem_cache_free_to_slab(cache, obj);
+ }
+
+ simple_unlock(&cache->lock);
+}
+#endif /* SLAB_USE_CPU_POOLS */
+
+static void kmem_cache_error(struct kmem_cache *cache, void *buf, int error,
+ void *arg)
+{
+ struct kmem_buftag *buftag;
+
+ kmem_error("cache: %s, buffer: %p", cache->name, (void *)buf);
+
+ switch(error) {
+ case KMEM_ERR_INVALID:
+ kmem_error("freeing invalid address");
+ break;
+ case KMEM_ERR_DOUBLEFREE:
+ kmem_error("attempting to free the same address twice");
+ break;
+ case KMEM_ERR_BUFTAG:
+ buftag = arg;
+ kmem_error("invalid buftag content, buftag state: %p",
+ (void *)buftag->state);
+ break;
+ case KMEM_ERR_MODIFIED:
+ kmem_error("free buffer modified, fault address: %p, "
+ "offset in buffer: %td", arg, arg - buf);
+ break;
+ case KMEM_ERR_REDZONE:
+ kmem_error("write beyond end of buffer, fault address: %p, "
+ "offset in buffer: %td", arg, arg - buf);
+ break;
+ default:
+ kmem_error("unknown error");
+ }
+
+ /*
+ * Never reached.
+ */
+}
+
+/*
+ * Compute an appropriate slab size for the given cache.
+ *
+ * Once the slab size is known, this function sets the related properties
+ * (buffers per slab and maximum color). It can also set the KMEM_CF_DIRECT
+ * and/or KMEM_CF_SLAB_EXTERNAL flags depending on the resulting layout.
+ */
+static void kmem_cache_compute_sizes(struct kmem_cache *cache, int flags)
+{
+ size_t i, buffers, buf_size, slab_size, free_slab_size, optimal_size;
+ size_t waste, waste_min;
+ int embed, optimal_embed = optimal_embed;
+
+ buf_size = cache->buf_size;
+
+ if (buf_size < KMEM_BUF_SIZE_THRESHOLD)
+ flags |= KMEM_CACHE_NOOFFSLAB;
+
+ i = 0;
+ waste_min = (size_t)-1;
+
+ do {
+ i++;
+ slab_size = P2ROUND(i * buf_size, PAGE_SIZE);
+ free_slab_size = slab_size;
+
+ if (flags & KMEM_CACHE_NOOFFSLAB)
+ free_slab_size -= sizeof(struct kmem_slab);
+
+ buffers = free_slab_size / buf_size;
+ waste = free_slab_size % buf_size;
+
+ if (buffers > i)
+ i = buffers;
+
+ if (flags & KMEM_CACHE_NOOFFSLAB)
+ embed = 1;
+ else if (sizeof(struct kmem_slab) <= waste) {
+ embed = 1;
+ waste -= sizeof(struct kmem_slab);
+ } else {
+ embed = 0;
+ }
+
+ if (waste <= waste_min) {
+ waste_min = waste;
+ optimal_size = slab_size;
+ optimal_embed = embed;
+ }
+ } while ((buffers < KMEM_MIN_BUFS_PER_SLAB)
+ && (slab_size < KMEM_SLAB_SIZE_THRESHOLD));
+
+ assert(!(flags & KMEM_CACHE_NOOFFSLAB) || optimal_embed);
+
+ cache->slab_size = optimal_size;
+ slab_size = cache->slab_size - (optimal_embed
+ ? sizeof(struct kmem_slab)
+ : 0);
+ cache->bufs_per_slab = slab_size / buf_size;
+ cache->color_max = slab_size % buf_size;
+
+ if (cache->color_max >= PAGE_SIZE)
+ cache->color_max = PAGE_SIZE - 1;
+
+ if (optimal_embed) {
+ if (cache->slab_size == PAGE_SIZE)
+ cache->flags |= KMEM_CF_DIRECT;
+ } else {
+ cache->flags |= KMEM_CF_SLAB_EXTERNAL;
+ }
+}
+
+void kmem_cache_init(struct kmem_cache *cache, const char *name,
+ size_t obj_size, size_t align, kmem_cache_ctor_t ctor,
+ kmem_slab_alloc_fn_t slab_alloc_fn,
+ kmem_slab_free_fn_t slab_free_fn, int flags)
+{
+#if SLAB_USE_CPU_POOLS
+ struct kmem_cpu_pool_type *cpu_pool_type;
+ size_t i;
+#endif /* SLAB_USE_CPU_POOLS */
+ size_t buf_size;
+
+#if SLAB_VERIFY
+ cache->flags = KMEM_CF_VERIFY;
+#else /* SLAB_VERIFY */
+ cache->flags = 0;
+#endif /* SLAB_VERIFY */
+
+ if (flags & KMEM_CACHE_NOCPUPOOL)
+ cache->flags |= KMEM_CF_NO_CPU_POOL;
+
+ if (flags & KMEM_CACHE_NORECLAIM) {
+ assert(slab_free_fn == NULL);
+ flags |= KMEM_CACHE_NOOFFSLAB;
+ cache->flags |= KMEM_CF_NO_RECLAIM;
+ }
+
+ if (flags & KMEM_CACHE_VERIFY)
+ cache->flags |= KMEM_CF_VERIFY;
+
+ if (align < KMEM_ALIGN_MIN)
+ align = KMEM_ALIGN_MIN;
+
+ assert(obj_size > 0);
+ assert(ISP2(align));
+ assert(align < PAGE_SIZE);
+
+ buf_size = P2ROUND(obj_size, align);
+
+ simple_lock_init(&cache->lock);
+ list_node_init(&cache->node);
+ list_init(&cache->partial_slabs);
+ list_init(&cache->free_slabs);
+ rbtree_init(&cache->active_slabs);
+ cache->obj_size = obj_size;
+ cache->align = align;
+ cache->buf_size = buf_size;
+ cache->bufctl_dist = buf_size - sizeof(union kmem_bufctl);
+ cache->color = 0;
+ cache->nr_objs = 0;
+ cache->nr_bufs = 0;
+ cache->nr_slabs = 0;
+ cache->nr_free_slabs = 0;
+ cache->ctor = ctor;
+ cache->slab_alloc_fn = slab_alloc_fn;
+ cache->slab_free_fn = slab_free_fn;
+ strncpy(cache->name, name, sizeof(cache->name));
+ cache->name[sizeof(cache->name) - 1] = '\0';
+ cache->buftag_dist = 0;
+ cache->redzone_pad = 0;
+
+ if (cache->flags & KMEM_CF_VERIFY) {
+ cache->bufctl_dist = buf_size;
+ cache->buftag_dist = cache->bufctl_dist + sizeof(union kmem_bufctl);
+ cache->redzone_pad = cache->bufctl_dist - cache->obj_size;
+ buf_size += sizeof(union kmem_bufctl) + sizeof(struct kmem_buftag);
+ buf_size = P2ROUND(buf_size, align);
+ cache->buf_size = buf_size;
+ }
+
+ kmem_cache_compute_sizes(cache, flags);
+
+#if SLAB_USE_CPU_POOLS
+ for (cpu_pool_type = kmem_cpu_pool_types;
+ buf_size <= cpu_pool_type->buf_size;
+ cpu_pool_type++);
+
+ cache->cpu_pool_type = cpu_pool_type;
+
+ for (i = 0; i < ARRAY_SIZE(cache->cpu_pools); i++)
+ kmem_cpu_pool_init(&cache->cpu_pools[i], cache);
+#endif /* SLAB_USE_CPU_POOLS */
+
+ simple_lock(&kmem_cache_list_lock);
+ list_insert_tail(&kmem_cache_list, &cache->node);
+ kmem_nr_caches++;
+ simple_unlock(&kmem_cache_list_lock);
+}
+
+static inline int kmem_cache_empty(struct kmem_cache *cache)
+{
+ return cache->nr_objs == cache->nr_bufs;
+}
+
+static int kmem_cache_grow(struct kmem_cache *cache)
+{
+ struct kmem_slab *slab;
+ size_t color;
+ int empty;
+
+ simple_lock(&cache->lock);
+
+ if (!kmem_cache_empty(cache)) {
+ simple_unlock(&cache->lock);
+ return 1;
+ }
+
+ color = cache->color;
+ cache->color += cache->align;
+
+ if (cache->color > cache->color_max)
+ cache->color = 0;
+
+ simple_unlock(&cache->lock);
+
+ slab = kmem_slab_create(cache, color);
+
+ simple_lock(&cache->lock);
+
+ if (slab != NULL) {
+ list_insert_tail(&cache->free_slabs, &slab->list_node);
+ cache->nr_bufs += cache->bufs_per_slab;
+ cache->nr_slabs++;
+ cache->nr_free_slabs++;
+ }
+
+ /*
+ * Even if our slab creation failed, another thread might have succeeded
+ * in growing the cache.
+ */
+ empty = kmem_cache_empty(cache);
+
+ simple_unlock(&cache->lock);
+
+ return !empty;
+}
+
+static void kmem_cache_reap(struct kmem_cache *cache)
+{
+ struct kmem_slab *slab;
+ struct list dead_slabs;
+
+ if (cache->flags & KMEM_CF_NO_RECLAIM)
+ return;
+
+ list_init(&dead_slabs);
+
+ simple_lock(&cache->lock);
+
+ while (!list_empty(&cache->free_slabs)) {
+ slab = list_first_entry(&cache->free_slabs, struct kmem_slab,
+ list_node);
+ list_remove(&slab->list_node);
+ list_insert(&dead_slabs, &slab->list_node);
+ cache->nr_bufs -= cache->bufs_per_slab;
+ cache->nr_slabs--;
+ cache->nr_free_slabs--;
+ }
+
+ simple_unlock(&cache->lock);
+
+ while (!list_empty(&dead_slabs)) {
+ slab = list_first_entry(&dead_slabs, struct kmem_slab, list_node);
+ list_remove(&slab->list_node);
+ kmem_slab_destroy(slab, cache);
+ }
+}
+
+/*
+ * Allocate a raw (unconstructed) buffer from the slab layer of a cache.
+ *
+ * The cache must be locked before calling this function.
+ */
+static void * kmem_cache_alloc_from_slab(struct kmem_cache *cache)
+{
+ struct kmem_slab *slab;
+ union kmem_bufctl *bufctl;
+
+ if (!list_empty(&cache->partial_slabs))
+ slab = list_first_entry(&cache->partial_slabs, struct kmem_slab,
+ list_node);
+ else if (!list_empty(&cache->free_slabs))
+ slab = list_first_entry(&cache->free_slabs, struct kmem_slab,
+ list_node);
+ else
+ return NULL;
+
+ bufctl = slab->first_free;
+ assert(bufctl != NULL);
+ slab->first_free = bufctl->next;
+ slab->nr_refs++;
+ cache->nr_objs++;
+
+ /*
+ * The slab has become complete.
+ */
+ if (slab->nr_refs == cache->bufs_per_slab) {
+ list_remove(&slab->list_node);
+
+ if (slab->nr_refs == 1)
+ cache->nr_free_slabs--;
+ } else if (slab->nr_refs == 1) {
+ /*
+ * The slab has become partial.
+ */
+ list_remove(&slab->list_node);
+ list_insert_tail(&cache->partial_slabs, &slab->list_node);
+ cache->nr_free_slabs--;
+ } else if (!list_singular(&cache->partial_slabs)) {
+ struct list *node;
+ struct kmem_slab *tmp;
+
+ /*
+ * The slab remains partial. If there is more than one partial slab,
+ * keep the list sorted.
+ */
+
+ assert(slab->nr_refs > 1);
+
+ for (node = list_prev(&slab->list_node);
+ !list_end(&cache->partial_slabs, node);
+ node = list_prev(node)) {
+ tmp = list_entry(node, struct kmem_slab, list_node);
+
+ if (tmp->nr_refs >= slab->nr_refs)
+ break;
+ }
+
+ /*
+ * If the direct neighbor was found, the list is already sorted.
+ * If no slab was found, the slab is inserted at the head of the list.
+ */
+ if (node != list_prev(&slab->list_node)) {
+ list_remove(&slab->list_node);
+ list_insert_after(node, &slab->list_node);
+ }
+ }
+
+ if ((slab->nr_refs == 1) && kmem_slab_use_tree(cache->flags))
+ rbtree_insert(&cache->active_slabs, &slab->tree_node,
+ kmem_slab_cmp_insert);
+
+ return kmem_bufctl_to_buf(bufctl, cache);
+}
+
+/*
+ * Release a buffer to the slab layer of a cache.
+ *
+ * The cache must be locked before calling this function.
+ */
+static void kmem_cache_free_to_slab(struct kmem_cache *cache, void *buf)
+{
+ struct kmem_slab *slab;
+ union kmem_bufctl *bufctl;
+
+ if (cache->flags & KMEM_CF_DIRECT) {
+ assert(cache->slab_size == PAGE_SIZE);
+ slab = (struct kmem_slab *)P2END((unsigned long)buf, cache->slab_size)
+ - 1;
+ } else {
+ struct rbtree_node *node;
+
+ node = rbtree_lookup_nearest(&cache->active_slabs, buf,
+ kmem_slab_cmp_lookup, RBTREE_LEFT);
+ assert(node != NULL);
+ slab = rbtree_entry(node, struct kmem_slab, tree_node);
+ assert((unsigned long)buf < (P2ALIGN((unsigned long)slab->addr
+ + cache->slab_size, PAGE_SIZE)));
+ }
+
+ assert(slab->nr_refs >= 1);
+ assert(slab->nr_refs <= cache->bufs_per_slab);
+ bufctl = kmem_buf_to_bufctl(buf, cache);
+ bufctl->next = slab->first_free;
+ slab->first_free = bufctl;
+ slab->nr_refs--;
+ cache->nr_objs--;
+
+ /*
+ * The slab has become free.
+ */
+ if (slab->nr_refs == 0) {
+ if (kmem_slab_use_tree(cache->flags))
+ rbtree_remove(&cache->active_slabs, &slab->tree_node);
+
+ /*
+ * The slab was partial.
+ */
+ if (cache->bufs_per_slab > 1)
+ list_remove(&slab->list_node);
+
+ list_insert_tail(&cache->free_slabs, &slab->list_node);
+ cache->nr_free_slabs++;
+ } else if (slab->nr_refs == (cache->bufs_per_slab - 1)) {
+ /*
+ * The slab has become partial.
+ */
+ list_insert(&cache->partial_slabs, &slab->list_node);
+ } else if (!list_singular(&cache->partial_slabs)) {
+ struct list *node;
+ struct kmem_slab *tmp;
+
+ /*
+ * The slab remains partial. If there is more than one partial slab,
+ * keep the list sorted.
+ */
+
+ assert(slab->nr_refs > 0);
+
+ for (node = list_next(&slab->list_node);
+ !list_end(&cache->partial_slabs, node);
+ node = list_next(node)) {
+ tmp = list_entry(node, struct kmem_slab, list_node);
+
+ if (tmp->nr_refs <= slab->nr_refs)
+ break;
+ }
+
+ /*
+ * If the direct neighbor was found, the list is already sorted.
+ * If no slab was found, the slab is inserted at the tail of the list.
+ */
+ if (node != list_next(&slab->list_node)) {
+ list_remove(&slab->list_node);
+ list_insert_before(node, &slab->list_node);
+ }
+ }
+}
+
+static void kmem_cache_alloc_verify(struct kmem_cache *cache, void *buf,
+ int construct)
+{
+ struct kmem_buftag *buftag;
+ union kmem_bufctl *bufctl;
+ void *addr;
+
+ buftag = kmem_buf_to_buftag(buf, cache);
+
+ if (buftag->state != KMEM_BUFTAG_FREE)
+ kmem_cache_error(cache, buf, KMEM_ERR_BUFTAG, buftag);
+
+ addr = kmem_buf_verify_fill(buf, KMEM_FREE_PATTERN, KMEM_UNINIT_PATTERN,
+ cache->bufctl_dist);
+
+ if (addr != NULL)
+ kmem_cache_error(cache, buf, KMEM_ERR_MODIFIED, addr);
+
+ addr = buf + cache->obj_size;
+ memset(addr, KMEM_REDZONE_BYTE, cache->redzone_pad);
+
+ bufctl = kmem_buf_to_bufctl(buf, cache);
+ bufctl->redzone = KMEM_REDZONE_WORD;
+ buftag->state = KMEM_BUFTAG_ALLOC;
+
+ if (construct && (cache->ctor != NULL))
+ cache->ctor(buf);
+}
+
+vm_offset_t kmem_cache_alloc(struct kmem_cache *cache)
+{
+ int filled;
+ void *buf;
+
+#if SLAB_USE_CPU_POOLS
+ struct kmem_cpu_pool *cpu_pool;
+
+ cpu_pool = kmem_cpu_pool_get(cache);
+
+ if (cpu_pool->flags & KMEM_CF_NO_CPU_POOL)
+ goto slab_alloc;
+
+ simple_lock(&cpu_pool->lock);
+
+fast_alloc:
+ if (likely(cpu_pool->nr_objs > 0)) {
+ buf = kmem_cpu_pool_pop(cpu_pool);
+ simple_unlock(&cpu_pool->lock);
+
+ if (cpu_pool->flags & KMEM_CF_VERIFY)
+ kmem_cache_alloc_verify(cache, buf, KMEM_AV_CONSTRUCT);
+
+ return (vm_offset_t)buf;
+ }
+
+ if (cpu_pool->array != NULL) {
+ filled = kmem_cpu_pool_fill(cpu_pool, cache);
+
+ if (!filled) {
+ simple_unlock(&cpu_pool->lock);
+
+ filled = kmem_cache_grow(cache);
+
+ if (!filled)
+ return 0;
+
+ simple_lock(&cpu_pool->lock);
+ }
+
+ goto fast_alloc;
+ }
+
+ simple_unlock(&cpu_pool->lock);
+#endif /* SLAB_USE_CPU_POOLS */
+
+slab_alloc:
+ simple_lock(&cache->lock);
+ buf = kmem_cache_alloc_from_slab(cache);
+ simple_unlock(&cache->lock);
+
+ if (buf == NULL) {
+ filled = kmem_cache_grow(cache);
+
+ if (!filled)
+ return 0;
+
+ goto slab_alloc;
+ }
+
+ if (cache->flags & KMEM_CF_VERIFY)
+ kmem_cache_alloc_verify(cache, buf, KMEM_AV_NOCONSTRUCT);
+
+ if (cache->ctor != NULL)
+ cache->ctor(buf);
+
+ return (vm_offset_t)buf;
+}
+
+static void kmem_cache_free_verify(struct kmem_cache *cache, void *buf)
+{
+ struct rbtree_node *node;
+ struct kmem_buftag *buftag;
+ struct kmem_slab *slab;
+ union kmem_bufctl *bufctl;
+ unsigned char *redzone_byte;
+ unsigned long slabend;
+
+ simple_lock(&cache->lock);
+ node = rbtree_lookup_nearest(&cache->active_slabs, buf,
+ kmem_slab_cmp_lookup, RBTREE_LEFT);
+ simple_unlock(&cache->lock);
+
+ if (node == NULL)
+ kmem_cache_error(cache, buf, KMEM_ERR_INVALID, NULL);
+
+ slab = rbtree_entry(node, struct kmem_slab, tree_node);
+ slabend = P2ALIGN((unsigned long)slab->addr + cache->slab_size, PAGE_SIZE);
+
+ if ((unsigned long)buf >= slabend)
+ kmem_cache_error(cache, buf, KMEM_ERR_INVALID, NULL);
+
+ if ((((unsigned long)buf - (unsigned long)slab->addr) % cache->buf_size)
+ != 0)
+ kmem_cache_error(cache, buf, KMEM_ERR_INVALID, NULL);
+
+ /*
+ * As the buffer address is valid, accessing its buftag is safe.
+ */
+ buftag = kmem_buf_to_buftag(buf, cache);
+
+ if (buftag->state != KMEM_BUFTAG_ALLOC) {
+ if (buftag->state == KMEM_BUFTAG_FREE)
+ kmem_cache_error(cache, buf, KMEM_ERR_DOUBLEFREE, NULL);
+ else
+ kmem_cache_error(cache, buf, KMEM_ERR_BUFTAG, buftag);
+ }
+
+ redzone_byte = buf + cache->obj_size;
+ bufctl = kmem_buf_to_bufctl(buf, cache);
+
+ while (redzone_byte < (unsigned char *)bufctl) {
+ if (*redzone_byte != KMEM_REDZONE_BYTE)
+ kmem_cache_error(cache, buf, KMEM_ERR_REDZONE, redzone_byte);
+
+ redzone_byte++;
+ }
+
+ if (bufctl->redzone != KMEM_REDZONE_WORD) {
+ unsigned long word;
+
+ word = KMEM_REDZONE_WORD;
+ redzone_byte = kmem_buf_verify_bytes(&bufctl->redzone, &word,
+ sizeof(bufctl->redzone));
+ kmem_cache_error(cache, buf, KMEM_ERR_REDZONE, redzone_byte);
+ }
+
+ kmem_buf_fill(buf, KMEM_FREE_PATTERN, cache->bufctl_dist);
+ buftag->state = KMEM_BUFTAG_FREE;
+}
+
+void kmem_cache_free(struct kmem_cache *cache, vm_offset_t obj)
+{
+#if SLAB_USE_CPU_POOLS
+ struct kmem_cpu_pool *cpu_pool;
+ void **array;
+
+ cpu_pool = kmem_cpu_pool_get(cache);
+
+ if (cpu_pool->flags & KMEM_CF_VERIFY) {
+#else /* SLAB_USE_CPU_POOLS */
+ if (cache->flags & KMEM_CF_VERIFY) {
+#endif /* SLAB_USE_CPU_POOLS */
+ kmem_cache_free_verify(cache, (void *)obj);
+ }
+
+#if SLAB_USE_CPU_POOLS
+ if (cpu_pool->flags & KMEM_CF_NO_CPU_POOL)
+ goto slab_free;
+
+ simple_lock(&cpu_pool->lock);
+
+fast_free:
+ if (likely(cpu_pool->nr_objs < cpu_pool->size)) {
+ kmem_cpu_pool_push(cpu_pool, (void *)obj);
+ simple_unlock(&cpu_pool->lock);
+ return;
+ }
+
+ if (cpu_pool->array != NULL) {
+ kmem_cpu_pool_drain(cpu_pool, cache);
+ goto fast_free;
+ }
+
+ simple_unlock(&cpu_pool->lock);
+
+ array = (void *)kmem_cache_alloc(cache->cpu_pool_type->array_cache);
+
+ if (array != NULL) {
+ simple_lock(&cpu_pool->lock);
+
+ /*
+ * Another thread may have built the CPU pool while the lock was
+ * dropped.
+ */
+ if (cpu_pool->array != NULL) {
+ simple_unlock(&cpu_pool->lock);
+ kmem_cache_free(cache->cpu_pool_type->array_cache,
+ (vm_offset_t)array);
+ goto fast_free;
+ }
+
+ kmem_cpu_pool_build(cpu_pool, cache, array);
+ goto fast_free;
+ }
+
+slab_free:
+#endif /* SLAB_USE_CPU_POOLS */
+
+ kmem_cache_free_to_slab(cache, (void *)obj);
+}
+
+void slab_collect(void)
+{
+ struct kmem_cache *cache;
+
+ if (sched_tick <= (kmem_gc_last_tick + KMEM_GC_INTERVAL))
+ return;
+
+ kmem_gc_last_tick = sched_tick;
+
+ simple_lock(&kmem_cache_list_lock);
+
+ list_for_each_entry(&kmem_cache_list, cache, node)
+ kmem_cache_reap(cache);
+
+ simple_unlock(&kmem_cache_list_lock);
+}
+
+void slab_bootstrap(void)
+{
+ /* Make sure a bufctl can always be stored in a buffer */
+ assert(sizeof(union kmem_bufctl) <= KMEM_ALIGN_MIN);
+
+ list_init(&kmem_cache_list);
+ simple_lock_init(&kmem_cache_list_lock);
+}
+
+void slab_init(void)
+{
+ vm_offset_t min, max;
+
+#if SLAB_USE_CPU_POOLS
+ struct kmem_cpu_pool_type *cpu_pool_type;
+ char name[KMEM_CACHE_NAME_SIZE];
+ size_t i, size;
+#endif /* SLAB_USE_CPU_POOLS */
+
+ kmem_submap(kmem_map, kernel_map, &min, &max, KMEM_MAP_SIZE, FALSE);
+
+#if SLAB_USE_CPU_POOLS
+ for (i = 0; i < ARRAY_SIZE(kmem_cpu_pool_types); i++) {
+ cpu_pool_type = &kmem_cpu_pool_types[i];
+ cpu_pool_type->array_cache = &kmem_cpu_array_caches[i];
+ sprintf(name, "kmem_cpu_array_%d", cpu_pool_type->array_size);
+ size = sizeof(void *) * cpu_pool_type->array_size;
+ kmem_cache_init(cpu_pool_type->array_cache, name, size,
+ cpu_pool_type->array_align, NULL, NULL, NULL, 0);
+ }
+#endif /* SLAB_USE_CPU_POOLS */
+
+ /*
+ * Prevent off slab data for the slab cache to avoid infinite recursion.
+ */
+ kmem_cache_init(&kmem_slab_cache, "kmem_slab", sizeof(struct kmem_slab),
+ 0, NULL, NULL, NULL, KMEM_CACHE_NOOFFSLAB);
+}
+
+static vm_offset_t kalloc_pagealloc(vm_size_t size)
+{
+ vm_offset_t addr;
+ kern_return_t kr;
+
+ kr = kmem_alloc_wired(kalloc_map, &addr, size);
+
+ if (kr != KERN_SUCCESS)
+ return 0;
+
+ return addr;
+}
+
+static void kalloc_pagefree(vm_offset_t ptr, vm_size_t size)
+{
+ kmem_free(kalloc_map, ptr, size);
+}
+
+void kalloc_init(void)
+{
+ char name[KMEM_CACHE_NAME_SIZE];
+ size_t i, size;
+ vm_offset_t min, max;
+
+ kmem_submap(kalloc_map, kernel_map, &min, &max, KALLOC_MAP_SIZE, FALSE);
+
+ size = 1 << KALLOC_FIRST_SHIFT;
+
+ for (i = 0; i < ARRAY_SIZE(kalloc_caches); i++) {
+ sprintf(name, "kalloc_%u", size);
+ kmem_cache_init(&kalloc_caches[i], name, size, 0, NULL,
+ kalloc_pagealloc, kalloc_pagefree, 0);
+ size <<= 1;
+ }
+}
+
+/*
+ * Return the kalloc cache index matching the given allocation size, which
+ * must be strictly greater than 0.
+ */
+static inline size_t kalloc_get_index(unsigned long size)
+{
+ assert(size != 0);
+
+ size = (size - 1) >> KALLOC_FIRST_SHIFT;
+
+ if (size == 0)
+ return 0;
+ else
+ return (sizeof(long) * 8) - __builtin_clzl(size);
+}
+
+static void kalloc_verify(struct kmem_cache *cache, void *buf, size_t size)
+{
+ size_t redzone_size;
+ void *redzone;
+
+ assert(size <= cache->obj_size);
+
+ redzone = buf + size;
+ redzone_size = cache->obj_size - size;
+ memset(redzone, KMEM_REDZONE_BYTE, redzone_size);
+}
+
+vm_offset_t kalloc(vm_size_t size)
+{
+ size_t index;
+ void *buf;
+
+ if (size == 0)
+ return 0;
+
+ index = kalloc_get_index(size);
+
+ if (index < ARRAY_SIZE(kalloc_caches)) {
+ struct kmem_cache *cache;
+
+ cache = &kalloc_caches[index];
+ buf = (void *)kmem_cache_alloc(cache);
+
+ if ((buf != 0) && (cache->flags & KMEM_CF_VERIFY))
+ kalloc_verify(cache, buf, size);
+ } else
+ buf = (void *)kalloc_pagealloc(size);
+
+ return (vm_offset_t)buf;
+}
+
+static void kfree_verify(struct kmem_cache *cache, void *buf, size_t size)
+{
+ unsigned char *redzone_byte, *redzone_end;
+
+ assert(size <= cache->obj_size);
+
+ redzone_byte = buf + size;
+ redzone_end = buf + cache->obj_size;
+
+ while (redzone_byte < redzone_end) {
+ if (*redzone_byte != KMEM_REDZONE_BYTE)
+ kmem_cache_error(cache, buf, KMEM_ERR_REDZONE, redzone_byte);
+
+ redzone_byte++;
+ }
+}
+
+void kfree(vm_offset_t data, vm_size_t size)
+{
+ size_t index;
+
+ if ((data == 0) || (size == 0))
+ return;
+
+ index = kalloc_get_index(size);
+
+ if (index < ARRAY_SIZE(kalloc_caches)) {
+ struct kmem_cache *cache;
+
+ cache = &kalloc_caches[index];
+
+ if (cache->flags & KMEM_CF_VERIFY)
+ kfree_verify(cache, (void *)data, size);
+
+ kmem_cache_free(cache, data);
+ } else {
+ kalloc_pagefree(data, size);
+ }
+}
+
+#if MACH_DEBUG
+kern_return_t host_slab_info(host_t host, cache_info_array_t *infop,
+ unsigned int *infoCntp)
+{
+ struct kmem_cache *cache;
+ cache_info_t *info;
+ unsigned int i, nr_caches;
+ vm_size_t info_size = info_size;
+ kern_return_t kr;
+
+ if (host == HOST_NULL)
+ return KERN_INVALID_HOST;
+
+ /*
+ * Assume the cache list is unaltered once the kernel is ready.
+ */
+
+ simple_lock(&kmem_cache_list_lock);
+ nr_caches = kmem_nr_caches;
+ simple_unlock(&kmem_cache_list_lock);
+
+ if (nr_caches <= *infoCntp)
+ info = *infop;
+ else {
+ vm_offset_t info_addr;
+
+ info_size = round_page(nr_caches * sizeof(*info));
+ kr = kmem_alloc_pageable(ipc_kernel_map, &info_addr, info_size);
+
+ if (kr != KERN_SUCCESS)
+ return kr;
+
+ info = (cache_info_t *)info_addr;
+ }
+
+ if (info == NULL)
+ return KERN_RESOURCE_SHORTAGE;
+
+ i = 0;
+
+ list_for_each_entry(&kmem_cache_list, cache, node) {
+ simple_lock(&cache->lock);
+ info[i].flags = ((cache->flags & KMEM_CF_NO_CPU_POOL)
+ ? CACHE_FLAGS_NO_CPU_POOL : 0)
+ | ((cache->flags & KMEM_CF_SLAB_EXTERNAL)
+ ? CACHE_FLAGS_SLAB_EXTERNAL : 0)
+ | ((cache->flags & KMEM_CF_NO_RECLAIM)
+ ? CACHE_FLAGS_NO_RECLAIM : 0)
+ | ((cache->flags & KMEM_CF_VERIFY)
+ ? CACHE_FLAGS_VERIFY : 0)
+ | ((cache->flags & KMEM_CF_DIRECT)
+ ? CACHE_FLAGS_DIRECT : 0);
+#if SLAB_USE_CPU_POOLS
+ info[i].cpu_pool_size = cache->cpu_pool_type->array_size;
+#else /* SLAB_USE_CPU_POOLS */
+ info[i].cpu_pool_size = 0;
+#endif /* SLAB_USE_CPU_POOLS */
+ info[i].obj_size = cache->obj_size;
+ info[i].align = cache->align;
+ info[i].buf_size = cache->buf_size;
+ info[i].slab_size = cache->slab_size;
+ info[i].bufs_per_slab = cache->bufs_per_slab;
+ info[i].nr_objs = cache->nr_objs;
+ info[i].nr_bufs = cache->nr_bufs;
+ info[i].nr_slabs = cache->nr_slabs;
+ info[i].nr_free_slabs = cache->nr_free_slabs;
+ strncpy(info[i].name, cache->name, sizeof(info[i].name));
+ info[i].name[sizeof(info[i].name) - 1] = '\0';
+ simple_unlock(&cache->lock);
+
+ i++;
+ }
+
+ if (info != *infop) {
+ vm_map_copy_t copy;
+ vm_size_t used;
+
+ used = nr_caches * sizeof(*info);
+
+ if (used != info_size)
+ memset((char *)info + used, 0, info_size - used);
+
+ kr = vm_map_copyin(ipc_kernel_map, (vm_offset_t)info, used, TRUE,
+ &copy);
+
+ assert(kr == KERN_SUCCESS);
+ *infop = (cache_info_t *)copy;
+ }
+
+ *infoCntp = nr_caches;
+
+ return KERN_SUCCESS;
+}
+#endif /* MACH_DEBUG */
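For clarity: kalloc() above selects one of the kalloc_* caches by rounding the
request up to a power of two, starting at 1 << KALLOC_FIRST_SHIFT (32 bytes),
and falls back to page allocation beyond the last cache. The following
standalone, user-space sketch (not part of the patch) reproduces
kalloc_get_index() to show that mapping; the sizes and indices in the
assertions follow directly from the definitions in kern/slab.c above.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    #define KALLOC_FIRST_SHIFT 5    /* first cache serves 32-byte objects */

    /* Same computation as kalloc_get_index() in kern/slab.c. */
    static size_t kalloc_get_index(unsigned long size)
    {
        size = (size - 1) >> KALLOC_FIRST_SHIFT;

        if (size == 0)
            return 0;

        return (sizeof(long) * 8) - __builtin_clzl(size);
    }

    int main(void)
    {
        /* Requests up to 32 bytes are served by kalloc_32 (index 0). */
        assert(kalloc_get_index(1) == 0);
        assert(kalloc_get_index(32) == 0);
        /* 33 to 64 bytes map to kalloc_64 (index 1), and so on. */
        assert(kalloc_get_index(33) == 1);
        assert(kalloc_get_index(64) == 1);
        /* A 4 KiB request lands in the cache of that exact size (32 << 7). */
        assert(kalloc_get_index(4096) == 7);
        printf("kalloc size-class mapping checks out\n");
        return 0;
    }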
diff --git a/kern/slab.h b/kern/slab.h
new file mode 100644
index 00000000..14c820b9
--- /dev/null
+++ b/kern/slab.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2009, 2010, 2011 Richard Braun.
+ * Copyright (c) 2011 Maksym Planeta.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _KERN_SLAB_H
+#define _KERN_SLAB_H
+
+#include <kern/lock.h>
+#include <kern/list.h>
+#include <kern/rbtree.h>
+#include <mach/machine/vm_types.h>
+#include <sys/types.h>
+#include <vm/vm_types.h>
+
+#if SLAB_USE_CPU_POOLS
+/*
+ * L1 cache line size.
+ */
+#define CPU_L1_SIZE (1 << CPU_L1_SHIFT)
+
+/*
+ * Per-processor cache of pre-constructed objects.
+ *
+ * The flags member is a read-only CPU-local copy of the parent cache flags.
+ */
+struct kmem_cpu_pool {
+ simple_lock_data_t lock;
+ int flags;
+ int size;
+ int transfer_size;
+ int nr_objs;
+ void **array;
+} __attribute__((aligned(CPU_L1_SIZE)));
+
+/*
+ * When a cache is created, its CPU pool type is determined from the buffer
+ * size. For small buffer sizes, many objects can be cached in a CPU pool.
+ * Conversely, for large buffer sizes, this would incur much overhead, so only
+ * a few objects are stored in a CPU pool.
+ */
+struct kmem_cpu_pool_type {
+ size_t buf_size;
+ int array_size;
+ size_t array_align;
+ struct kmem_cache *array_cache;
+};
+#endif /* SLAB_USE_CPU_POOLS */
+
+/*
+ * Buffer descriptor.
+ *
+ * For normal caches (i.e. without KMEM_CF_VERIFY), bufctls are located at the
+ * end of (but inside) each buffer. If KMEM_CF_VERIFY is set, bufctls are
+ * located after each buffer.
+ *
+ * When an object is allocated to a client, its bufctl isn't used. This memory
+ * is instead used for redzoning if cache debugging is in effect.
+ */
+union kmem_bufctl {
+ union kmem_bufctl *next;
+ unsigned long redzone;
+};
+
+/*
+ * Buffer tag.
+ *
+ * This structure is only used for KMEM_CF_VERIFY caches. It is located after
+ * the bufctl and includes information about the state of the buffer it
+ * describes (allocated or not). It should be thought of as a debugging
+ * extension of the bufctl.
+ */
+struct kmem_buftag {
+ unsigned long state;
+};
+
+/*
+ * Page-aligned collection of unconstructed buffers.
+ */
+struct kmem_slab {
+ struct list list_node;
+ struct rbtree_node tree_node;
+ unsigned long nr_refs;
+ union kmem_bufctl *first_free;
+ void *addr;
+};
+
+/*
+ * Type for constructor functions.
+ *
+ * The pre-constructed state of an object is supposed to include only
+ * elements such as linked lists, locks and reference counters. Therefore
+ * constructors are expected to 1) never fail and 2) not need any
+ * user-provided data. The first constraint implies that object construction
+ * never performs dynamic resource allocation, which also means there is no
+ * need for destructors.
+ */
+typedef void (*kmem_cache_ctor_t)(void *obj);
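
A constructor that respects these constraints only initializes embedded members. A minimal sketch, assuming a made-up struct example_obj and the list_node_init/simple_lock_init initializers from kern/list.h and kern/lock.h:

    /* Hypothetical object and constructor: cheap, infallible initialization
       of embedded members only, no dynamic allocation. */
    struct example_obj {
        struct list link;
        simple_lock_data_t lock;
        unsigned long refcount;
    };

    static void example_obj_ctor(void *obj)
    {
        struct example_obj *o = obj;

        list_node_init(&o->link);
        simple_lock_init(&o->lock);
        o->refcount = 0;
    }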
+
+/*
+ * Types for slab allocation/free functions.
+ *
+ * All addresses and sizes must be page-aligned.
+ */
+typedef vm_offset_t (*kmem_slab_alloc_fn_t)(vm_size_t);
+typedef void (*kmem_slab_free_fn_t)(vm_offset_t, vm_size_t);
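
These typedefs describe the backend a cache uses to obtain and return whole slabs. A sketch of a matching pair built on the kernel memory interface (kmem_alloc_wired/kmem_free on the kmem_map declared below, with the usual vm/vm_kern.h declarations assumed); the names example_slab_alloc and example_slab_free are invented for this illustration.

    static vm_offset_t example_slab_alloc(vm_size_t size)
    {
        vm_offset_t addr;

        /* Both size and the returned address are page-aligned,
           as required by the typedefs above. */
        if (kmem_alloc_wired(kmem_map, &addr, size) != KERN_SUCCESS)
            return 0;

        return addr;
    }

    static void example_slab_free(vm_offset_t addr, vm_size_t size)
    {
        kmem_free(kmem_map, addr, size);
    }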
+
+/*
+ * Cache name buffer size.
+ */
+#define KMEM_CACHE_NAME_SIZE 32
+
+/*
+ * Cache of objects.
+ *
+ * Locking order: cpu_pool -> cache. CPU pool locking is ordered by CPU ID.
+ *
+ * The partial slabs list is sorted by slab references. Slabs with a high
+ * number of references are placed first on the list to reduce fragmentation.
+ * Sorting occurs at insertion/removal of buffers in a slab. As the list
+ * is maintained sorted, and the number of references only changes by one,
+ * this is a very cheap operation in the average case and the worst (linear)
+ * case is very unlikely.
+ */
+struct kmem_cache {
+#if SLAB_USE_CPU_POOLS
+ /* CPU pool layer */
+ struct kmem_cpu_pool cpu_pools[NCPUS];
+ struct kmem_cpu_pool_type *cpu_pool_type;
+#endif /* SLAB_USE_CPU_POOLS */
+
+ /* Slab layer */
+ simple_lock_data_t lock;
+ struct list node; /* Cache list linkage */
+ struct list partial_slabs;
+ struct list free_slabs;
+ struct rbtree active_slabs;
+ int flags;
+ size_t obj_size; /* User-provided size */
+ size_t align;
+ size_t buf_size; /* Aligned object size */
+ size_t bufctl_dist; /* Distance from buffer to bufctl */
+ size_t slab_size;
+ size_t color;
+ size_t color_max;
+ unsigned long bufs_per_slab;
+ unsigned long nr_objs; /* Number of allocated objects */
+ unsigned long nr_bufs; /* Total number of buffers */
+ unsigned long nr_slabs;
+ unsigned long nr_free_slabs;
+ kmem_cache_ctor_t ctor;
+ kmem_slab_alloc_fn_t slab_alloc_fn;
+ kmem_slab_free_fn_t slab_free_fn;
+ char name[KMEM_CACHE_NAME_SIZE];
+ size_t buftag_dist; /* Distance from buffer to buftag */
+ size_t redzone_pad; /* Bytes from end of object to redzone word */
+};
+
+/*
+ * Mach-style declarations for struct kmem_cache.
+ */
+typedef struct kmem_cache *kmem_cache_t;
+#define KMEM_CACHE_NULL ((kmem_cache_t) 0)
+
+/*
+ * VM submap for slab allocations.
+ */
+extern vm_map_t kmem_map;
+
+/*
+ * Cache initialization flags.
+ */
+#define KMEM_CACHE_NOCPUPOOL 0x1 /* Don't use the per-cpu pools */
+#define KMEM_CACHE_NOOFFSLAB 0x2 /* Don't allocate external slab data */
+#define KMEM_CACHE_NORECLAIM 0x4 /* Never give slabs back to their source,
+ implies KMEM_CACHE_NOOFFSLAB */
+#define KMEM_CACHE_VERIFY 0x8 /* Use debugging facilities */
+
+/*
+ * Initialize a cache.
+ */
+void kmem_cache_init(struct kmem_cache *cache, const char *name,
+ size_t obj_size, size_t align, kmem_cache_ctor_t ctor,
+ kmem_slab_alloc_fn_t slab_alloc_fn,
+ kmem_slab_free_fn_t slab_free_fn, int flags);
+
+/*
+ * Allocate an object from a cache.
+ */
+vm_offset_t kmem_cache_alloc(struct kmem_cache *cache);
+
+/*
+ * Release an object to its cache.
+ */
+void kmem_cache_free(struct kmem_cache *cache, vm_offset_t obj);
+
+/*
+ * Initialize the memory allocator module.
+ */
+void slab_bootstrap(void);
+void slab_init(void);
+
+/*
+ * Release free slabs to the VM system.
+ */
+void slab_collect(void);
+
+#endif /* _KERN_SLAB_H */
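
Putting the interface above together, a usage sketch with a fictional struct foo client. Passing NULL for the constructor and the slab functions assumes the allocator falls back to sensible defaults in that case; that is an assumption of this sketch, not something stated by the header.

    /* Hypothetical client of the new cache interface. */
    struct foo {
        int bar;
    };

    static struct kmem_cache foo_cache;

    void foo_module_init(void)
    {
        /* No constructor, default slab backend (assumed to be selected
           when NULL is passed), natural alignment, no flags. */
        kmem_cache_init(&foo_cache, "foo", sizeof(struct foo), 0,
                        NULL, NULL, NULL, 0);
    }

    struct foo *foo_create(void)
    {
        return (struct foo *)kmem_cache_alloc(&foo_cache);
    }

    void foo_destroy(struct foo *foo)
    {
        kmem_cache_free(&foo_cache, (vm_offset_t)foo);
    }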
diff --git a/kern/zalloc.c b/kern/zalloc.c
deleted file mode 100644
index 43836a65..00000000
--- a/kern/zalloc.c
+++ /dev/null
@@ -1,1007 +0,0 @@
-/*
- * Mach Operating System
- * Copyright (c) 1993-1987 Carnegie Mellon University.
- * Copyright (c) 1993,1994 The University of Utah and
- * the Computer Systems Laboratory (CSL).
- * All rights reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF
- * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY
- * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF
- * THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
- * School of Computer Science
- * Carnegie Mellon University
- * Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- * File: kern/zalloc.c
- * Author: Avadis Tevanian, Jr.
- *
- * Zone-based memory allocator. A zone is a collection of fixed size
- * data blocks for which quick allocation/deallocation is possible.
- */
-
-#include <string.h>
-
-#include <kern/debug.h>
-#include <kern/macro_help.h>
-#include <kern/printf.h>
-#include <kern/mach_clock.h>
-#include <kern/sched.h>
-#include <kern/zalloc.h>
-#include <mach/vm_param.h>
-#include <vm/vm_kern.h>
-#include <machine/machspl.h>
-
-#if MACH_DEBUG
-#include <mach/kern_return.h>
-#include <mach/machine/vm_types.h>
-#include <mach_debug/zone_info.h>
-#include <kern/host.h>
-#include <vm/vm_map.h>
-#include <vm/vm_user.h>
-#include <vm/vm_kern.h>
-#endif
-
-#define ADD_TO_ZONE(zone, element) \
-MACRO_BEGIN \
- *((vm_offset_t *)(element)) = (zone)->free_elements; \
- (zone)->free_elements = (vm_offset_t) (element); \
- zone_count_down(zone); \
-MACRO_END
-
-#define REMOVE_FROM_ZONE(zone, ret, type) \
-MACRO_BEGIN \
- (ret) = (type) (zone)->free_elements; \
- if ((ret) != (type) 0) { \
- zone_count_up(zone); \
- (zone)->free_elements = *((vm_offset_t *)(ret)); \
- } \
-MACRO_END
-
-#define ALIGN_SIZE_UP(size, align) \
-((size) = (((size) + ((align) - 1)) & ~((align) - 1)))
-
-/*
- * Support for garbage collection of unused zone pages:
- */
-
-struct zone_page_table_entry {
- struct zone_page_table_entry *next;
- short in_free_list;
- short alloc_count;
-};
-
-extern struct zone_page_table_entry * zone_page_table;
-extern vm_offset_t zone_map_min_address;
-
-#define lock_zone_page_table() simple_lock(&zone_page_table_lock)
-#define unlock_zone_page_table() simple_unlock(&zone_page_table_lock)
-
-#define zone_page(addr) \
- (&(zone_page_table[(atop(((vm_offset_t)addr) - zone_map_min_address))]))
-
-
-extern void zone_page_alloc(vm_offset_t, vm_size_t);
-extern void zone_page_dealloc(vm_offset_t, vm_size_t);
-extern void zone_page_in_use(vm_offset_t, vm_size_t);
-extern void zone_page_free(vm_offset_t, vm_size_t);
-
-zone_t zone_zone; /* this is the zone containing other zones */
-
-boolean_t zone_ignore_overflow = TRUE;
-
-vm_map_t zone_map = VM_MAP_NULL;
-vm_size_t zone_map_size = 64 * 1024 * 1024;
-
-/*
- * The VM system gives us an initial chunk of memory.
- * It has to be big enough to allocate the zone_zone
- * and some initial kernel data structures, like kernel maps.
- * It is advantageous to make it bigger than really necessary,
- * because this memory is more efficient than normal kernel
- * virtual memory. (It doesn't have vm_page structures backing it
- * and it may have other machine-dependent advantages.)
- * So for best performance, zdata_size should approximate
- * the amount of memory you expect the zone system to consume.
- */
-
-vm_offset_t zdata;
-vm_size_t zdata_size = 420 * 1024;
-
-#define zone_lock(zone) \
-MACRO_BEGIN \
- if (zone->type & ZONE_PAGEABLE) { \
- lock_write(&zone->complex_lock); \
- } else { \
- simple_lock(&zone->lock); \
- } \
-MACRO_END
-
-#define zone_unlock(zone) \
-MACRO_BEGIN \
- if (zone->type & ZONE_PAGEABLE) { \
- lock_done(&zone->complex_lock); \
- } else { \
- simple_unlock(&zone->lock); \
- } \
-MACRO_END
-
-#define zone_lock_init(zone) \
-MACRO_BEGIN \
- if (zone->type & ZONE_PAGEABLE) { \
- lock_init(&zone->complex_lock, TRUE); \
- } else { \
- simple_lock_init(&zone->lock); \
- } \
-MACRO_END
-
-static vm_offset_t zget_space(vm_offset_t size, vm_size_t align);
-
-decl_simple_lock_data(,zget_space_lock)
-vm_offset_t zalloc_next_space;
-vm_offset_t zalloc_end_of_space;
-vm_size_t zalloc_wasted_space;
-
-/*
- * Garbage collection map information
- */
-decl_simple_lock_data(,zone_page_table_lock)
-struct zone_page_table_entry * zone_page_table;
-vm_offset_t zone_map_min_address;
-vm_offset_t zone_map_max_address;
-int zone_pages;
-
-extern void zone_page_init(vm_offset_t, vm_size_t, int);
-
-#define ZONE_PAGE_USED 0
-#define ZONE_PAGE_UNUSED -1
-
-
-/*
- * Protects first_zone, last_zone, num_zones,
- * and the next_zone field of zones.
- */
-decl_simple_lock_data(,all_zones_lock)
-zone_t first_zone;
-zone_t *last_zone;
-int num_zones;
-
-/*
- * zinit initializes a new zone. The zone data structures themselves
- * are stored in a zone, which is initially a static structure that
- * is initialized by zone_init.
- */
-zone_t zinit(size, align, max, alloc, memtype, name)
- vm_size_t size; /* the size of an element */
- vm_size_t align; /* alignment of elements */
- vm_size_t max; /* maximum memory to use */
- vm_size_t alloc; /* allocation size */
- unsigned int memtype; /* flags specifying type of memory */
- char *name; /* a name for the zone */
-{
- register zone_t z;
-
- if (zone_zone == ZONE_NULL)
- z = (zone_t) zget_space(sizeof(struct zone), 0);
- else
- z = (zone_t) zalloc(zone_zone);
- if (z == ZONE_NULL)
- panic("zinit");
- if (alloc == 0)
- alloc = PAGE_SIZE;
-
- if (size == 0)
- size = sizeof(z->free_elements);
- /*
- * Round off all the parameters appropriately.
- */
-
- if ((max = round_page(max)) < (alloc = round_page(alloc)))
- max = alloc;
-
- if (align > 0) {
- if (PAGE_SIZE % align || align % sizeof(z->free_elements))
- panic("zinit");
- ALIGN_SIZE_UP(size, align);
- }
-
- z->free_elements = 0;
- z->cur_size = 0;
- z->max_size = max;
- z->elem_size = ((size-1) + sizeof(z->free_elements)) -
- ((size-1) % sizeof(z->free_elements));
- z->align = align;
-
- z->alloc_size = alloc;
- z->type = memtype;
- z->zone_name = name;
-#ifdef ZONE_COUNT
- z->count = 0;
-#endif
- z->doing_alloc = FALSE;
- zone_lock_init(z);
-
- /*
- * Add the zone to the all-zones list.
- */
-
- z->next_zone = ZONE_NULL;
- simple_lock(&all_zones_lock);
- *last_zone = z;
- last_zone = &z->next_zone;
- num_zones++;
- simple_unlock(&all_zones_lock);
-
- return(z);
-}
-
-/*
- * Cram the given memory into the specified zone.
- */
-void zcram(zone_t zone, vm_offset_t newmem, vm_size_t size)
-{
- register vm_size_t elem_size;
-
- if (newmem == (vm_offset_t) 0) {
- panic("zcram - memory at zero");
- }
- elem_size = zone->elem_size;
-
- zone_lock(zone);
- while (size >= elem_size) {
- ADD_TO_ZONE(zone, newmem);
- zone_page_alloc(newmem, elem_size);
- zone_count_up(zone); /* compensate for ADD_TO_ZONE */
- size -= elem_size;
- newmem += elem_size;
- zone->cur_size += elem_size;
- }
- zone_unlock(zone);
-}
-
-/*
- * Contiguous space allocator for non-paged zones. Allocates "size" amount
- * of memory from zone_map.
- */
-
-static vm_offset_t zget_space(vm_offset_t size, vm_size_t align)
-{
- vm_offset_t new_space = 0;
- vm_offset_t result;
- vm_size_t space_to_add = 0; /*'=0' to quiet gcc warnings */
-
- simple_lock(&zget_space_lock);
- if (align > 0) {
- assert(align < PAGE_SIZE);
- ALIGN_SIZE_UP(zalloc_next_space, align);
- }
-
- while ((zalloc_next_space + size) > zalloc_end_of_space) {
- /*
- * Add at least one page to allocation area.
- */
-
- space_to_add = round_page(size);
-
- if (new_space == 0) {
- /*
- * Memory cannot be wired down while holding
- * any locks that the pageout daemon might
- * need to free up pages. [Making the zget_space
- * lock a complex lock does not help in this
- * regard.]
- *
- * Unlock and allocate memory. Because several
- * threads might try to do this at once, don't
- * use the memory before checking for available
- * space again.
- */
-
- simple_unlock(&zget_space_lock);
-
- if (kmem_alloc_wired(zone_map,
- &new_space, space_to_add)
- != KERN_SUCCESS)
- return(0);
- zone_page_init(new_space, space_to_add,
- ZONE_PAGE_USED);
- simple_lock(&zget_space_lock);
- if (align > 0)
- ALIGN_SIZE_UP(zalloc_next_space, align);
- continue;
- }
-
-
- /*
- * Memory was allocated in a previous iteration.
- *
- * Check whether the new region is contiguous
- * with the old one.
- */
-
- if (new_space != zalloc_end_of_space) {
- /*
- * Throw away the remainder of the
- * old space, and start a new one.
- */
- zalloc_wasted_space +=
- zalloc_end_of_space - zalloc_next_space;
- zalloc_next_space = new_space;
- }
-
- zalloc_end_of_space = new_space + space_to_add;
-
- new_space = 0;
- }
- result = zalloc_next_space;
- zalloc_next_space += size;
- simple_unlock(&zget_space_lock);
-
- if (new_space != 0)
- kmem_free(zone_map, new_space, space_to_add);
-
- return(result);
-}
-
-
-/*
- * Initialize the "zone of zones" which uses fixed memory allocated
- * earlier in memory initialization. zone_bootstrap is called
- * before zone_init.
- */
-void zone_bootstrap(void)
-{
- simple_lock_init(&all_zones_lock);
- first_zone = ZONE_NULL;
- last_zone = &first_zone;
- num_zones = 0;
-
- simple_lock_init(&zget_space_lock);
- zalloc_next_space = zdata;
- zalloc_end_of_space = zdata + zdata_size;
- zalloc_wasted_space = 0;
-
- zone_zone = ZONE_NULL;
- zone_zone = zinit(sizeof(struct zone), 0, 128 * sizeof(struct zone),
- sizeof(struct zone), 0, "zones");
-}
-
-void zone_init(void)
-{
- vm_offset_t zone_min;
- vm_offset_t zone_max;
-
- vm_size_t zone_table_size;
-
- zone_map = kmem_suballoc(kernel_map, &zone_min, &zone_max,
- zone_map_size, FALSE);
-
- /*
- * Setup garbage collection information:
- */
-
- zone_table_size = atop(zone_max - zone_min) *
- sizeof(struct zone_page_table_entry);
- if (kmem_alloc_wired(zone_map, (vm_offset_t *) &zone_page_table,
- zone_table_size) != KERN_SUCCESS)
- panic("zone_init");
- zone_min = (vm_offset_t)zone_page_table + round_page(zone_table_size);
- zone_pages = atop(zone_max - zone_min);
- zone_map_min_address = zone_min;
- zone_map_max_address = zone_max;
- simple_lock_init(&zone_page_table_lock);
- zone_page_init(zone_min, zone_max - zone_min, ZONE_PAGE_UNUSED);
-}
-
-
-/*
- * zalloc returns an element from the specified zone.
- */
-vm_offset_t zalloc(zone_t zone)
-{
- vm_offset_t addr;
-
- if (zone == ZONE_NULL)
- panic ("zalloc: null zone");
-
- check_simple_locks();
-
- zone_lock(zone);
- REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
- while (addr == 0) {
- /*
- * If nothing was there, try to get more
- */
- if (zone->doing_alloc) {
- /*
- * Someone is allocating memory for this zone.
- * Wait for it to show up, then try again.
- */
- assert_wait((event_t)&zone->doing_alloc, TRUE);
- /* XXX say wakeup needed */
- zone_unlock(zone);
- thread_block((void (*)()) 0);
- zone_lock(zone);
- }
- else {
- if ((zone->cur_size + (zone->type & ZONE_PAGEABLE ?
- zone->alloc_size : zone->elem_size)) >
- zone->max_size) {
- if (zone->type & ZONE_EXHAUSTIBLE)
- break;
- /*
- * Printf calls logwakeup, which calls
- * select_wakeup which will do a zfree
- * (which tries to take the select_zone
- * lock... Hang. Release the lock now
- * so it can be taken again later.
- * NOTE: this used to be specific to
- * the select_zone, but for
- * cleanliness, we just unlock all
- * zones before this.
- */
- if (!(zone->type & ZONE_FIXED)) {
- /*
- * We're willing to overflow certain
- * zones, but not without complaining.
- *
- * This is best used in conjunction
-					 * with the collectable flag. What we
- * want is an assurance we can get the
- * memory back, assuming there's no
- * leak.
- */
- zone->max_size += (zone->max_size >> 1);
- } else if (!zone_ignore_overflow) {
- zone_unlock(zone);
- printf("zone \"%s\" empty.\n",
- zone->zone_name);
- panic("zalloc: zone %s exhausted",
- zone->zone_name);
- }
- }
-
- if (zone->type & ZONE_PAGEABLE)
- zone->doing_alloc = TRUE;
- zone_unlock(zone);
-
- if (zone->type & ZONE_PAGEABLE) {
- if (kmem_alloc_pageable(zone_map, &addr,
- zone->alloc_size)
- != KERN_SUCCESS)
- panic("zalloc: no pageable memory for zone %s",
- zone->zone_name);
- zcram(zone, addr, zone->alloc_size);
- zone_lock(zone);
- zone->doing_alloc = FALSE;
- /* XXX check before doing this */
- thread_wakeup((event_t)&zone->doing_alloc);
-
- REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
- } else if (zone->type & ZONE_COLLECTABLE) {
- if (kmem_alloc_wired(zone_map,
- &addr, zone->alloc_size)
- != KERN_SUCCESS)
- panic("zalloc: no wired memory for zone %s",
- zone->zone_name);
- zone_page_init(addr, zone->alloc_size,
- ZONE_PAGE_USED);
- zcram(zone, addr, zone->alloc_size);
- zone_lock(zone);
- REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
- } else {
- addr = zget_space(zone->elem_size, zone->align);
- if (addr == 0)
- panic("zalloc: no memory for zone %s",
- zone->zone_name);
-
- zone_lock(zone);
- zone_count_up(zone);
- zone->cur_size += zone->elem_size;
- zone_unlock(zone);
- zone_page_alloc(addr, zone->elem_size);
- return(addr);
- }
- }
- }
-
- zone_unlock(zone);
- return(addr);
-}
-
-
-/*
- * zget returns an element from the specified zone
- * and immediately returns nothing if there is nothing there.
- *
- * This form should be used when you can not block (like when
- * processing an interrupt).
- */
-vm_offset_t zget(zone_t zone)
-{
- register vm_offset_t addr;
-
- if (zone == ZONE_NULL)
- panic ("zalloc: null zone");
-
- zone_lock(zone);
- REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
- zone_unlock(zone);
-
- return(addr);
-}
-
-boolean_t zone_check = FALSE;
-
-void zfree(zone_t zone, vm_offset_t elem)
-{
- zone_lock(zone);
- if (zone_check) {
- vm_offset_t this;
-
- /* check the zone's consistency */
-
- for (this = zone->free_elements;
- this != 0;
- this = * (vm_offset_t *) this)
- if (this == elem)
- panic("zfree");
- }
- ADD_TO_ZONE(zone, elem);
- zone_unlock(zone);
-}
-
-/*
- * Zone garbage collection subroutines
- *
- * These routines have in common the modification of entries in the
- * zone_page_table. The latter contains one entry for every page
- * in the zone_map.
- *
- * For each page table entry in the given range:
- *
- * zone_page_in_use - decrements in_free_list
- * zone_page_free - increments in_free_list
- * zone_page_init - initializes in_free_list and alloc_count
- * zone_page_alloc - increments alloc_count
- * zone_page_dealloc - decrements alloc_count
- * zone_add_free_page_list - adds the page to the free list
- *
- * Two counts are maintained for each page, the in_free_list count and
- * alloc_count. The alloc_count is how many zone elements have been
- * allocated from a page. (Note that the page could contain elements
- * that span page boundaries. The count includes these elements so
- * one element may be counted in two pages.) In_free_list is a count
- * of how many zone elements are currently free. If in_free_list is
- * equal to alloc_count then the page is eligible for garbage
- * collection.
- *
- * Alloc_count and in_free_list are initialized to the correct values
- * for a particular zone when a page is zcram'ed into a zone. Subsequent
- * gets and frees of zone elements will call zone_page_in_use and
- * zone_page_free which modify the in_free_list count. When the zone
- * garbage collector runs it will walk through a zone's free element list,
- * remove the elements that reside on collectable pages, and use
- * zone_add_free_page_list to create a list of pages to be collected.
- */
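
In short, a page whose free count has caught up with its allocation count holds no live elements and can be reclaimed. A minimal illustration of that rule, using the zone_page_table_entry fields defined above; example_zone_page_collectable is not a function of this file.

    static boolean_t
    example_zone_page_collectable(const struct zone_page_table_entry *entry)
    {
        /* Every element counted against the page is on a free list. */
        return entry->in_free_list == entry->alloc_count;
    }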
-
-void zone_page_in_use(addr, size)
-vm_offset_t addr;
-vm_size_t size;
-{
- int i, j;
- if ((addr < zone_map_min_address) ||
- (addr+size > zone_map_max_address)) return;
- i = atop(addr-zone_map_min_address);
- j = atop((addr+size-1) - zone_map_min_address);
- lock_zone_page_table();
- for (; i <= j; i++) {
- zone_page_table[i].in_free_list--;
- }
- unlock_zone_page_table();
-}
-
-void zone_page_free(addr, size)
-vm_offset_t addr;
-vm_size_t size;
-{
- int i, j;
- if ((addr < zone_map_min_address) ||
- (addr+size > zone_map_max_address)) return;
- i = atop(addr-zone_map_min_address);
- j = atop((addr+size-1) - zone_map_min_address);
- lock_zone_page_table();
- for (; i <= j; i++) {
- /* Set in_free_list to (ZONE_PAGE_USED + 1) if
- * it was previously set to ZONE_PAGE_UNUSED.
- */
- if (zone_page_table[i].in_free_list == ZONE_PAGE_UNUSED) {
- zone_page_table[i].in_free_list = 1;
- } else {
- zone_page_table[i].in_free_list++;
- }
- }
- unlock_zone_page_table();
-}
-
-void zone_page_init(addr, size, value)
-
-vm_offset_t addr;
-vm_size_t size;
-int value;
-{
- int i, j;
- if ((addr < zone_map_min_address) ||
- (addr+size > zone_map_max_address)) return;
- i = atop(addr-zone_map_min_address);
- j = atop((addr+size-1) - zone_map_min_address);
- lock_zone_page_table();
- for (; i <= j; i++) {
- zone_page_table[i].alloc_count = value;
- zone_page_table[i].in_free_list = 0;
- }
- unlock_zone_page_table();
-}
-
-void zone_page_alloc(addr, size)
-vm_offset_t addr;
-vm_size_t size;
-{
- int i, j;
- if ((addr < zone_map_min_address) ||
- (addr+size > zone_map_max_address)) return;
- i = atop(addr-zone_map_min_address);
- j = atop((addr+size-1) - zone_map_min_address);
- lock_zone_page_table();
- for (; i <= j; i++) {
- /* Set alloc_count to (ZONE_PAGE_USED + 1) if
- * it was previously set to ZONE_PAGE_UNUSED.
- */
- if (zone_page_table[i].alloc_count == ZONE_PAGE_UNUSED) {
- zone_page_table[i].alloc_count = 1;
- } else {
- zone_page_table[i].alloc_count++;
- }
- }
- unlock_zone_page_table();
-}
-
-void zone_page_dealloc(addr, size)
-vm_offset_t addr;
-vm_size_t size;
-{
- int i, j;
- if ((addr < zone_map_min_address) ||
- (addr+size > zone_map_max_address)) return;
- i = atop(addr-zone_map_min_address);
- j = atop((addr+size-1) - zone_map_min_address);
- lock_zone_page_table();
- for (; i <= j; i++) {
- zone_page_table[i].alloc_count--;
- }
- unlock_zone_page_table();
-}
-
-void
-zone_add_free_page_list(free_list, addr, size)
- struct zone_page_table_entry **free_list;
- vm_offset_t addr;
- vm_size_t size;
-{
- int i, j;
- if ((addr < zone_map_min_address) ||
- (addr+size > zone_map_max_address)) return;
- i = atop(addr-zone_map_min_address);
- j = atop((addr+size-1) - zone_map_min_address);
- lock_zone_page_table();
- for (; i <= j; i++) {
- if (zone_page_table[i].alloc_count == 0) {
- zone_page_table[i].next = *free_list;
- *free_list = &zone_page_table[i];
- zone_page_table[i].alloc_count = ZONE_PAGE_UNUSED;
- zone_page_table[i].in_free_list = 0;
- }
- }
- unlock_zone_page_table();
-}
-
-
-/* This is used for walking through a zone's free element list.
- */
-struct zone_free_entry {
- struct zone_free_entry * next;
-};
-
-
-/* Zone garbage collection
- *
- * zone_gc will walk through all the free elements in all the
- * zones that are marked collectable looking for reclaimable
- * pages. zone_gc is called by consider_zone_gc when the system
- * begins to run out of memory.
- */
-static void zone_gc(void)
-{
- int max_zones;
- zone_t z;
- int i;
- register spl_t s;
- struct zone_page_table_entry *freep;
- struct zone_page_table_entry *zone_free_page_list;
-
- simple_lock(&all_zones_lock);
- max_zones = num_zones;
- z = first_zone;
- simple_unlock(&all_zones_lock);
-
- zone_free_page_list = (struct zone_page_table_entry *) 0;
-
- for (i = 0; i < max_zones; i++) {
- struct zone_free_entry * last;
- struct zone_free_entry * elt;
- assert(z != ZONE_NULL);
-	/* run this at splhigh so that interrupt routines that use zones
-	   cannot interrupt while their zone is locked */
- s=splhigh();
- zone_lock(z);
-
- if ((z->type & (ZONE_PAGEABLE|ZONE_COLLECTABLE)) == ZONE_COLLECTABLE) {
-
- /* Count the free elements in each page. This loop
- * requires that all in_free_list entries are zero.
- */
- elt = (struct zone_free_entry *)(z->free_elements);
- while ((elt != (struct zone_free_entry *)0)) {
- zone_page_free((vm_offset_t)elt, z->elem_size);
- elt = elt->next;
- }
-
- /* Now determine which elements should be removed
- * from the free list and, after all the elements
- * on a page have been removed, add the element's
- * page to a list of pages to be freed.
- */
- elt = (struct zone_free_entry *)(z->free_elements);
- last = elt;
- while ((elt != (struct zone_free_entry *)0)) {
- if (((vm_offset_t)elt>=zone_map_min_address)&&
- ((vm_offset_t)elt<=zone_map_max_address)&&
- (zone_page(elt)->in_free_list ==
- zone_page(elt)->alloc_count)) {
-
- z->cur_size -= z->elem_size;
- zone_page_in_use((vm_offset_t)elt, z->elem_size);
- zone_page_dealloc((vm_offset_t)elt, z->elem_size);
- if (zone_page(elt)->alloc_count == 0 ||
- zone_page(elt+(z->elem_size-1))->alloc_count==0) {
- zone_add_free_page_list(
- &zone_free_page_list,
- (vm_offset_t)elt, z->elem_size);
- }
-
-
- if (elt == last) {
- elt = elt->next;
- z->free_elements =(vm_offset_t)elt;
- last = elt;
- } else {
- last->next = elt->next;
- elt = elt->next;
- }
- } else {
- /* This element is not eligible for collection
- * so clear in_free_list in preparation for a
- * subsequent garbage collection pass.
- */
- if (((vm_offset_t)elt>=zone_map_min_address)&&
- ((vm_offset_t)elt<=zone_map_max_address)) {
- zone_page(elt)->in_free_list = 0;
- }
- last = elt;
- elt = elt->next;
- }
- }
- }
- zone_unlock(z);
- splx(s);
- simple_lock(&all_zones_lock);
- z = z->next_zone;
- simple_unlock(&all_zones_lock);
- }
-
- for (freep = zone_free_page_list; freep != 0; freep = freep->next) {
- vm_offset_t free_addr;
-
- free_addr = zone_map_min_address +
- PAGE_SIZE * (freep - zone_page_table);
-
- /* Hack Hack */
- /* Needed to make vm_map_delete's vm_map_clip_end always be
- * able to get an element without having to call zget_space and
- * hang because zone_map is already locked by vm_map_delete */
-
- extern zone_t vm_map_kentry_zone; /* zone for kernel entry structures */
- vm_offset_t entry1 = zalloc(vm_map_kentry_zone),
- entry2 = zalloc(vm_map_kentry_zone);
- zfree(vm_map_kentry_zone, entry1);
- zfree(vm_map_kentry_zone, entry2);
-
- kmem_free(zone_map, free_addr, PAGE_SIZE);
- }
-}
-
-boolean_t zone_gc_allowed = TRUE;
-unsigned zone_gc_last_tick = 0;
-unsigned zone_gc_max_rate = 0; /* in ticks */
-
-/*
- * consider_zone_gc:
- *
- * Called by the pageout daemon when the system needs more free pages.
- */
-
-void
-consider_zone_gc(void)
-{
- /*
- * By default, don't attempt zone GC more frequently
- * than once a second.
- */
-
- if (zone_gc_max_rate == 0)
- zone_gc_max_rate = hz;
-
- if (zone_gc_allowed &&
- (sched_tick > (zone_gc_last_tick + zone_gc_max_rate))) {
- zone_gc_last_tick = sched_tick;
- zone_gc();
- }
-}
-
-#if MACH_DEBUG
-kern_return_t host_zone_info(host, namesp, namesCntp, infop, infoCntp)
- host_t host;
- zone_name_array_t *namesp;
- unsigned int *namesCntp;
- zone_info_array_t *infop;
- unsigned int *infoCntp;
-{
- zone_name_t *names;
- vm_offset_t names_addr;
- vm_size_t names_size = 0; /*'=0' to quiet gcc warnings */
- zone_info_t *info;
- vm_offset_t info_addr;
- vm_size_t info_size = 0; /*'=0' to quiet gcc warnings */
- unsigned int max_zones, i;
- zone_t z;
- kern_return_t kr;
-
- if (host == HOST_NULL)
- return KERN_INVALID_HOST;
-
- /*
- * We assume that zones aren't freed once allocated.
- * We won't pick up any zones that are allocated later.
- */
-
- simple_lock(&all_zones_lock);
- max_zones = num_zones;
- z = first_zone;
- simple_unlock(&all_zones_lock);
-
- if (max_zones <= *namesCntp) {
- /* use in-line memory */
-
- names = *namesp;
- } else {
- names_size = round_page(max_zones * sizeof *names);
- kr = kmem_alloc_pageable(ipc_kernel_map,
- &names_addr, names_size);
- if (kr != KERN_SUCCESS)
- return kr;
-
- names = (zone_name_t *) names_addr;
- }
-
- if (max_zones <= *infoCntp) {
- /* use in-line memory */
-
- info = *infop;
- } else {
- info_size = round_page(max_zones * sizeof *info);
- kr = kmem_alloc_pageable(ipc_kernel_map,
- &info_addr, info_size);
- if (kr != KERN_SUCCESS) {
- if (names != *namesp)
- kmem_free(ipc_kernel_map,
- names_addr, names_size);
- return kr;
- }
-
- info = (zone_info_t *) info_addr;
- }
-
- for (i = 0; i < max_zones; i++) {
- zone_name_t *zn = &names[i];
- zone_info_t *zi = &info[i];
- struct zone zcopy;
-
- assert(z != ZONE_NULL);
-
- zone_lock(z);
- zcopy = *z;
- zone_unlock(z);
-
- simple_lock(&all_zones_lock);
- z = z->next_zone;
- simple_unlock(&all_zones_lock);
-
- /* assuming here the name data is static */
- (void) strncpy(zn->zn_name, zcopy.zone_name,
- sizeof zn->zn_name);
-
-#ifdef ZONE_COUNT
- zi->zi_count = zcopy.count;
-#else
- zi->zi_count = 0;
-#endif
- zi->zi_cur_size = zcopy.cur_size;
- zi->zi_max_size = zcopy.max_size;
- zi->zi_elem_size = zcopy.elem_size;
- zi->zi_alloc_size = zcopy.alloc_size;
- zi->zi_pageable = (zcopy.type & ZONE_PAGEABLE) != 0;
- zi->zi_exhaustible = (zcopy.type & ZONE_EXHAUSTIBLE) != 0;
- zi->zi_collectable = (zcopy.type & ZONE_COLLECTABLE) != 0;
- }
-
- if (names != *namesp) {
- vm_size_t used;
- vm_map_copy_t copy;
-
- used = max_zones * sizeof *names;
-
- if (used != names_size)
- memset((char *) (names_addr + used), 0, names_size - used);
-
- kr = vm_map_copyin(ipc_kernel_map, names_addr, names_size,
- TRUE, &copy);
- assert(kr == KERN_SUCCESS);
-
- *namesp = (zone_name_t *) copy;
- }
- *namesCntp = max_zones;
-
- if (info != *infop) {
- vm_size_t used;
- vm_map_copy_t copy;
-
- used = max_zones * sizeof *info;
-
- if (used != info_size)
- memset((char *) (info_addr + used), 0, info_size - used);
-
- kr = vm_map_copyin(ipc_kernel_map, info_addr, info_size,
- TRUE, &copy);
- assert(kr == KERN_SUCCESS);
-
- *infop = (zone_info_t *) copy;
- }
- *infoCntp = max_zones;
-
- return KERN_SUCCESS;
-}
-#endif /* MACH_DEBUG */
diff --git a/kern/zalloc.h b/kern/zalloc.h
deleted file mode 100644
index f1a1850b..00000000
--- a/kern/zalloc.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University.
- * Copyright (c) 1993,1994 The University of Utah and
- * the Computer Systems Laboratory (CSL).
- * All rights reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF
- * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY
- * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF
- * THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
- * School of Computer Science
- * Carnegie Mellon University
- * Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- * File: zalloc.h
- * Author: Avadis Tevanian, Jr.
- * Date: 1985
- *
- */
-
-#ifndef _KERN_ZALLOC_H_
-#define _KERN_ZALLOC_H_
-
-#include <mach/machine/vm_types.h>
-#include <kern/macro_help.h>
-#include <kern/lock.h>
-#include <kern/queue.h>
-#include <machine/zalloc.h>
-
-/*
- * A zone is a collection of fixed size blocks for which there
- * is fast allocation/deallocation access. Kernel routines can
- * use zones to manage data structures dynamically, creating a zone
- * for each type of data structure to be managed.
- *
- */
-
-struct zone {
- decl_simple_lock_data(,lock) /* generic lock */
-#ifdef ZONE_COUNT
- int count; /* Number of elements used now */
-#endif
- vm_offset_t free_elements;
- vm_size_t cur_size; /* current memory utilization */
- vm_size_t max_size; /* how large can this zone grow */
- vm_size_t elem_size; /* size of an element */
- vm_size_t align; /* alignment of elements */
- vm_size_t alloc_size; /* size used for more memory */
- boolean_t doing_alloc; /* is zone expanding now? */
- char *zone_name; /* a name for the zone */
- unsigned int type; /* type of memory */
- lock_data_t complex_lock; /* Lock for pageable zones */
- struct zone *next_zone; /* Link for all-zones list */
-};
-typedef struct zone *zone_t;
-
-#define ZONE_NULL ((zone_t) 0)
-
-/* Exported to everyone */
-zone_t zinit(vm_size_t size, vm_size_t align, vm_size_t max,
- vm_size_t alloc, unsigned int memtype, char *name);
-vm_offset_t zalloc(zone_t zone);
-vm_offset_t zget(zone_t zone);
-void zfree(zone_t zone, vm_offset_t elem);
-void zcram(zone_t zone, vm_offset_t newmem, vm_size_t size);
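
For comparison with the cache interface introduced in kern/slab.h, this is roughly how a client of the zone interface being removed here looked; the fictional struct foo, the sizing parameters and the zone name are invented for the illustration.

    /* Hypothetical client of the removed zone interface:
       one zone per object type. */
    zone_t foo_zone;

    void foo_module_init(void)
    {
        foo_zone = zinit(sizeof(struct foo), 0,
                         256 * sizeof(struct foo), /* max memory */
                         PAGE_SIZE,                /* chunk added per expansion */
                         ZONE_COLLECTABLE, "foo zone");
    }

    struct foo *foo_create(void)
    {
        return (struct foo *)zalloc(foo_zone);
    }

    void foo_destroy(struct foo *foo)
    {
        zfree(foo_zone, (vm_offset_t)foo);
    }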
-
-/* Exported only to vm/vm_init.c */
-void zone_bootstrap(void);
-void zone_init(void);
-
-/* Exported only to vm/vm_pageout.c */
-void consider_zone_gc(void);
-
-
-/* Memory type bits for zones */
-#define ZONE_PAGEABLE 0x00000001
-#define ZONE_COLLECTABLE 0x00000002 /* Garbage-collect this zone when memory runs low */
-#define ZONE_EXHAUSTIBLE 0x00000004 /* zalloc() on this zone is allowed to fail */
-#define ZONE_FIXED 0x00000008 /* Panic if zone is exhausted (XXX) */
-
-/* Machine-dependent code can provide additional memory types. */
-#define ZONE_MACHINE_TYPES 0xffff0000
-
-
-#ifdef ZONE_COUNT
-#define zone_count_up(zone) ((zone)->count++)
-#define zone_count_down(zone) ((zone)->count--)
-#else
-#define zone_count_up(zone)
-#define zone_count_down(zone)
-#endif
-
-
-
-/* These quick inline versions only work for small, nonpageable zones (currently). */
-
-static __inline vm_offset_t ZALLOC(zone_t zone)
-{
- simple_lock(&zone->lock);
- if (zone->free_elements == 0) {
- simple_unlock(&zone->lock);
- return zalloc(zone);
- } else {
- vm_offset_t element = zone->free_elements;
- zone->free_elements = *((vm_offset_t *)(element));
- zone_count_up(zone);
- simple_unlock(&zone->lock);
- return element;
- }
-}
-
-static __inline void ZFREE(zone_t zone, vm_offset_t element)
-{
- *((vm_offset_t *)(element)) = zone->free_elements;
- zone->free_elements = (vm_offset_t) (element);
- zone_count_down(zone);
-}
-
-
-
-#endif /* _KERN_ZALLOC_H_ */