The Non-Uniform Memory Access (NUMA) model. Under this model, memory is no longer one flat block: each CPU has its own local memory, and accessing local memory does not have to cross the shared bus, so it is much faster. A CPU together with its local memory is called a NUMA node. When local memory runs low, a CPU can still request memory from another NUMA node, but that remote access comes with noticeably higher latency.
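As a quick way to observe this topology from user space, here is a minimal sketch (an illustration added to this write-up, not part of the kernel code that follows) using the libnuma API to list each node's memory and place a buffer on a specific node; build with -lnuma.

/* Sketch: enumerate NUMA nodes and allocate on a chosen node via libnuma. */
#include <numa.h>     /* numa_available, numa_max_node, numa_node_size64, ... */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }

    for (int node = 0; node <= numa_max_node(); node++) {
        long long free_bytes = 0;
        long long total = numa_node_size64(node, &free_bytes);
        printf("node %d: total %lld MiB, free %lld MiB\n",
               node, total >> 20, free_bytes >> 20);
    }

    /* Allocate 16 MiB backed by node 0's local memory; CPUs on other
     * nodes touching it would pay the higher cross-node latency. */
    size_t len = 16UL << 20;
    void *buf = numa_alloc_onnode(len, 0);
    if (!buf)
        return 1;
    numa_free(buf, len);
    return 0;
}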
/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * it's memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];
    struct zonelist node_zonelists[MAX_ZONELISTS];
    int nr_zones;
    struct page *node_mem_map;
    unsigned long node_start_pfn;
    unsigned long node_present_pages; /* total number of physical pages */
    unsigned long node_spanned_pages; /* total size of physical page range, including holes */
    int node_id;
    wait_queue_head_t kswapd_wait;
    wait_queue_head_t pfmemalloc_wait;
    struct task_struct *kswapd;    /* Protected by mem_hotplug_begin/end() */
    int kswapd_order;
    enum zone_type kswapd_classzone_idx;
    int kswapd_failures;           /* Number of 'reclaimed == 0' runs */
    ......
} pg_data_t;
Hot and cold pages and per_cpu_pageset: if a page has already been loaded into a CPU's hardware cache, it is a hot page and the CPU can read it much faster; otherwise it is a cold page. Because every CPU has its own cache, each zone keeps one per_cpu_pageset per CPU.
struct zone {
    ......
    struct pglist_data *zone_pgdat;
    struct per_cpu_pageset __percpu *pageset;
    ......
    /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
    unsigned long zone_start_pfn;

    /*
     * spanned_pages is the total pages spanned by the zone, including
     * holes, which is calculated as:
     *     spanned_pages = zone_end_pfn - zone_start_pfn;
     *
     * present_pages is physical pages existing within the zone, which
     * is calculated as:
     *     present_pages = spanned_pages - absent_pages(pages in holes);
     *
     * managed_pages is present pages managed by the buddy system, which
     * is calculated as (reserved_pages includes pages allocated by the
     * bootmem allocator):
     *     managed_pages = present_pages - reserved_pages;
     *
     * So present_pages may be used by memory hotplug or memory power
     * management logic to figure out unmanaged pages by checking
     * (present_pages - managed_pages). And managed_pages should be used
     * by page allocator and vm scanner to calculate all kinds of watermarks
     * and thresholds.
     *
     * Locking rules:
     *
     * zone_start_pfn and spanned_pages are protected by span_seqlock.
     * It is a seqlock because it has to be read outside of zone->lock,
     * and it is done in the main allocator path.  But, it is written
     * quite infrequently.
     *
     * The span_seq lock is declared along with zone->lock because it is
     * frequently read in proximity to zone->lock.  It's good to
     * give them a chance of being in the same cacheline.
     *
     * Write access to present_pages at runtime should be protected by
     * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
     * present_pages should get_online_mems() to get a stable value.
     */
    atomic_long_t managed_pages;
    unsigned long spanned_pages;
    unsigned long present_pages;

    const char *name;
    ......
    int initialized;

    /* Write-intensive fields used from the page allocator */
    ZONE_PADDING(_pad1_)

    /* free areas of different sizes */
    struct free_area free_area[MAX_ORDER];

    /* zone flags, see below */
    unsigned long flags;

    /* Primarily protects free_area */
    spinlock_t lock;

    /* Write-intensive fields used by compaction and vmstats. */
    ZONE_PADDING(_pad2_)

    /*
     * When free pages are below this point, additional steps are taken
     * when reading the number of free pages to avoid per-cpu counter
     * drift allowing watermarks to be breached
     */
    unsigned long percpu_drift_mark;
    ......
    bool contiguous;

    ZONE_PADDING(_pad3_)
    /* Zone statistics */
    atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
    atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
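The spanned/present/managed counters and the per-CPU pagesets above are exported through /proc/zoneinfo, so a zone's layout can be inspected at runtime. A minimal sketch follows; the field names match what typical recent kernels print, but treat the exact output format as an assumption.

/* Sketch: print the spanned/present/managed counters from /proc/zoneinfo. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/proc/zoneinfo", "r");
    char line[256];

    if (!f) {
        perror("fopen /proc/zoneinfo");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        /* Zone headers look like "Node 0, zone   Normal". */
        if (strncmp(line, "Node", 4) == 0 ||
            strstr(line, "spanned") || strstr(line, "present") ||
            strstr(line, "managed"))
            fputs(line, stdout);
    }
    fclose(f);
    return 0;
}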
struct page {
    unsigned long flags;        /* Atomic flags, some possibly updated asynchronously */
    /*
     * Five words (20/40 bytes) are available in this union.
     * WARNING: bit 0 of the first word is used for PageTail(). That
     * means the other users of this union MUST NOT use the bit to
     * avoid collision and false-positive PageTail().
     */
    union {
        struct {    /* Page cache and anonymous pages */
            /**
             * @lru: Pageout list, eg. active_list protected by
             * pgdat->lru_lock.  Sometimes used as a generic list
             * by the page owner.
             */
            struct list_head lru;
            /* See page-flags.h for PAGE_MAPPING_FLAGS */
            struct address_space *mapping;
            pgoff_t index;        /* Our offset within mapping. */
            /**
             * @private: Mapping-private opaque data.
             * Usually used for buffer_heads if PagePrivate.
             * Used for swp_entry_t if PageSwapCache.
             * Indicates order in the buddy system if PageBuddy.
             */
            unsigned long private;
        };
        struct {    /* page_pool used by netstack */
            /**
             * @dma_addr: might require a 64-bit value even on
             * 32-bit architectures.
             */
            dma_addr_t dma_addr;
        };
        struct {    /* slab, slob and slub */
            union {
                struct list_head slab_list;    /* uses lru */
                struct {    /* Partial pages */
                    struct page *next;
                    ......
                };
            };
            struct kmem_cache *slab_cache;    /* not slob */
            /* Double-word boundary */
            void *freelist;        /* first free object */
            union {
                void *s_mem;            /* slab: first object */
                unsigned long counters; /* SLUB */
                struct {                /* SLUB */
                    unsigned inuse:16;
                    unsigned objects:15;
                    unsigned frozen:1;
                };
            };
        };
        ......
        struct {    /* ZONE_DEVICE pages */
            /** @pgmap: Points to the hosting device page map. */
            struct dev_pagemap *pgmap;
            unsigned long hmm_data;
            unsigned long _zd_pad_1;    /* uses mapping */
        };

        /** @rcu_head: You can use this to free a page by RCU. */
        struct rcu_head rcu_head;
    };
    ......
} _struct_page_alignment;
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
    return alloc_pages_current(gfp_mask, order);
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *      %GFP_USER    user allocation,
 *      %GFP_KERNEL  kernel allocation,
 *      %GFP_HIGHMEM highmem allocation,
 *      %GFP_FS      don't call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool.  When not in
 * interrupt context and apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
    struct mempolicy *pol = &default_policy;
    struct page *page;

    if (!in_interrupt() && !(gfp & __GFP_THISNODE))
        pol = get_task_policy(current);

    /*
     * No reference counting needed for current->mempolicy
     * nor system default_policy
     */
    if (pol->mode == MPOL_INTERLEAVE)
        page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
    else
        page = __alloc_pages_nodemask(gfp, order,
                policy_node(gfp, pol, numa_node_id()),
                policy_nodemask(gfp, pol));

    return page;
}
EXPORT_SYMBOL(alloc_pages_current);
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
                            nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW;
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };
    ......
    gfp_mask &= gfp_allowed_mask;
    alloc_mask = gfp_mask;
    if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask,
                             &ac, &alloc_mask, &alloc_flags))
        return NULL;

    finalise_ac(gfp_mask, &ac);

    /*
     * Forbid the first pass from falling back to types that fragment
     * memory until all local zones are considered.
     */
    alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

    /* First allocation attempt */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    ......
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
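For reference, this is the interface a kernel-side caller ends up using. The fragment below is a hedged sketch of a module init/exit pair (names like buddy_demo_init are made up) that asks the buddy allocator for an order-2 block, i.e. four physically contiguous pages, and frees it again.

/* Sketch (kernel module context): grab 2^2 pages from the buddy allocator. */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/module.h>

static struct page *pages;

static int __init buddy_demo_init(void)
{
    /* order = 2 -> 4 physically contiguous pages */
    pages = alloc_pages(GFP_KERNEL, 2);
    if (!pages)
        return -ENOMEM;

    pr_info("got 4 pages, kernel vaddr %p\n", page_address(pages));
    return 0;
}

static void __exit buddy_demo_exit(void)
{
    __free_pages(pages, 2);    /* order must match the allocation */
}

module_init(buddy_demo_init);
module_exit(buddy_demo_exit);
MODULE_LICENSE("GPL");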
/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                        const struct alloc_context *ac)
{
    ......
    for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                    ac->nodemask) {
        ......
        page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                       gfp_mask, alloc_flags, ac->migratetype);
        if (page) {
            prep_new_page(page, order, gfp_mask, alloc_flags);

            /*
             * If this is a high-order atomic allocation then check
             * if the pageblock should be reserved for the future
             */
            if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
                reserve_highatomic_pageblock(page, zone, order);

            return page;
        }
    ......
}
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                                int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        page = list_first_entry_or_null(&area->free_list[migratetype],
                                        struct page, lru);
        if (!page)
            continue;
        list_del(&page->lru);
        rmv_page_order(page);
        area->nr_free--;
        expand(zone, page, order, current_order, area, migratetype);
        set_pcppage_migratetype(page, migratetype);
        return page;
    }

    return NULL;
}
In expand(), each pass through the loop moves the free_area pointer down one order (area--), right-shifts size by one bit (i.e., halves it), and then hands the now-unused upper half of the block back to the corresponding free list with list_add(). A small userspace simulation of this splitting follows the kernel code below.
/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        area--;
        high--;
        size >>= 1;
        VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

        /*
         * Mark as guard pages (or page), that will allow to
         * merge back to allocator when buddy will be freed.
         * Corresponding page table entries will not be touched,
         * pages will stay not present in virtual address space
         */
        if (set_page_guard(zone, &page[size], high, migratetype))
            continue;

        list_add(&page[size].lru, &area->free_list[migratetype]);
        area->nr_free++;
        set_page_order(&page[size], high);
    }
}
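To make the splitting loop concrete, here is a small userspace simulation (purely illustrative; everything in it is made up for this sketch) of what happens when an order-2 request is served from an order-4 free block: the block is halved twice, and each unused upper half goes onto the free list of the next lower order.

/* Sketch: simulate expand() splitting an order-4 block for an order-2 request. */
#include <stdio.h>

int main(void)
{
    int low = 2;                        /* requested order              */
    int high = 4;                       /* order of the block we found  */
    unsigned long size = 1UL << high;   /* pages in that block          */

    printf("serving an order-%d request from an order-%d block (%lu pages)\n",
           low, high, size);

    while (high > low) {
        high--;
        size >>= 1;    /* halve: the part we keep shrinks each round */
        /* The upper half (pages [size, 2*size)) goes back onto the
         * free_list of order `high`, like list_add() in expand(). */
        printf("  put back %lu pages at offset %lu on free_area[%d]\n",
               size, size, high);
    }
    printf("returning the remaining %lu pages to the caller\n", size);
    return 0;
}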
/*
 * Slab cache management.
 */
struct kmem_cache {
    struct kmem_cache_cpu __percpu *cpu_slab;
    /* Used for retrieving partial slabs, etc. */
    slab_flags_t flags;
    unsigned long min_partial;
    unsigned int size;        /* The size of an object including metadata */
    unsigned int object_size; /* The size of an object without metadata */
    unsigned int offset;      /* Free pointer offset */
#ifdef CONFIG_SLUB_CPU_PARTIAL
    /* Number of per cpu partial objects to keep around */
    unsigned int cpu_partial;
#endif
    struct kmem_cache_order_objects oo;

    /* Allocation and freeing of slabs */
    struct kmem_cache_order_objects max;
    struct kmem_cache_order_objects min;
    gfp_t allocflags;         /* gfp flags to use on each alloc */
    int refcount;             /* Refcount for slab cache destroy */
    void (*ctor)(void *);
    unsigned int inuse;          /* Offset to metadata */
    unsigned int align;          /* Alignment */
    unsigned int red_left_pad;   /* Left redzone padding size */
    const char *name;            /* Name (only for display!) */
    struct list_head list;       /* List of slab caches */
    ......
    unsigned int useroffset;     /* Usercopy region offset */
    unsigned int usersize;       /* Usercopy region size */

    struct kmem_cache_node *node[MAX_NUMNODES];
};
struct kmem_cache_cpu {
    void **freelist;       /* Pointer to next available object */
    unsigned long tid;     /* Globally unique transaction id */
    struct page *page;     /* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
    struct page *partial;  /* Partially allocated frozen slabs */
#endif
#ifdef CONFIG_SLUB_STATS
    unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};
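From a kernel developer's point of view, the per-CPU plumbing above sits behind a small API. The following is a hedged sketch of the usual pattern (the cache name and the my_record struct are made up for illustration): create a cache once, allocate and free objects from it, and destroy it on teardown.

/* Sketch (kernel module context): create a slab cache and allocate from it. */
#include <linux/slab.h>

struct my_record {
    int id;
    char name[32];
};

static struct kmem_cache *record_cache;

static int records_init(void)
{
    record_cache = kmem_cache_create("my_record_cache",
                                     sizeof(struct my_record),
                                     0, SLAB_HWCACHE_ALIGN, NULL);
    if (!record_cache)
        return -ENOMEM;
    return 0;
}

static void records_use(void)
{
    /* Fastpath: usually served straight from kmem_cache_cpu->freelist. */
    struct my_record *r = kmem_cache_alloc(record_cache, GFP_KERNEL);

    if (r) {
        r->id = 1;
        kmem_cache_free(record_cache, r);
    }
}

static void records_exit(void)
{
    kmem_cache_destroy(record_cache);
}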
/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
        gfp_t gfpflags, int node, unsigned long addr)
{
    void *object;
    struct kmem_cache_cpu *c;
    struct page *page;
    unsigned long tid;

    s = slab_pre_alloc_hook(s, gfpflags);
    if (!s)
        return NULL;
redo:
    /*
     * Must read kmem_cache cpu data via this cpu ptr. Preemption is
     * enabled. We may switch back and forth between cpus while
     * reading from one cpu area. That does not matter as long
     * as we end up on the original cpu again when doing the cmpxchg.
     *
     * We should guarantee that tid and kmem_cache are retrieved on
     * the same cpu. It could be different if CONFIG_PREEMPT so we need
     * to check if it is matched or not.
     */
    do {
        tid = this_cpu_read(s->cpu_slab->tid);
        c = raw_cpu_ptr(s->cpu_slab);
    } while (IS_ENABLED(CONFIG_PREEMPT) &&
             unlikely(tid != READ_ONCE(c->tid)));

    /*
     * Irqless object alloc/free algorithm used here depends on sequence
     * of fetching cpu_slab's data. tid should be fetched before anything
     * on c to guarantee that object and page associated with previous tid
     * won't be used with current tid. If we fetch tid first, object and
     * page could be one associated with next tid and our alloc/free
     * request will be failed. In this case, we will retry. So, no problem.
     */
    barrier();

    /*
     * The transaction ids are globally unique per cpu and per operation on
     * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
     * occurs on the right processor and that there was no operation on the
     * linked list in between.
     */
    object = c->freelist;
    page = c->page;
    if (unlikely(!object || !node_match(page, node))) {
        object = __slab_alloc(s, gfpflags, node, addr, c);
        stat(s, ALLOC_SLOWPATH);
    }
    ......
    return object;
}
/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
 *
 * Version of __slab_alloc to use when we know that interrupts are
 * already disabled (which is the case for bulk allocation).
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                           unsigned long addr, struct kmem_cache_cpu *c)
{
    void *freelist;
    struct page *page;

    page = c->page;
    if (!page)
        goto new_slab;
redo:

    if (unlikely(!node_match(page, node))) {
        int searchnode = node;

        if (node != NUMA_NO_NODE && !node_present_pages(node))
            searchnode = node_to_mem_node(node);

        if (unlikely(!node_match(page, searchnode))) {
            stat(s, ALLOC_NODE_MISMATCH);
            deactivate_slab(s, page, c->freelist, c);
            goto new_slab;
        }
    }

    /*
     * By rights, we should be searching for a slab page that was
     * PFMEMALLOC but right now, we are losing the pfmemalloc
     * information when the page leaves the per-cpu allocator
     */
    if (unlikely(!pfmemalloc_match(page, gfpflags))) {
        deactivate_slab(s, page, c->freelist, c);
        goto new_slab;
    }

    /* must check again c->freelist in case of cpu migration or IRQ */
    freelist = c->freelist;
    if (freelist)
        goto load_freelist;

    freelist = get_freelist(s, page);

    if (!freelist) {
        c->page = NULL;
        stat(s, DEACTIVATE_BYPASS);
        goto new_slab;
    }

    stat(s, ALLOC_REFILL);

load_freelist:
    /*
     * freelist is pointing to the list of objects to be used.
     * page is pointing to the page from which the objects are obtained.
     * That page must be frozen for per cpu allocations to work.
     */
    VM_BUG_ON(!c->page->frozen);
    c->freelist = get_freepointer(s, freelist);
    c->tid = next_tid(c->tid);
    return freelist;

new_slab:

    if (slub_percpu_partial(c)) {
        page = c->page = slub_percpu_partial(c);
        slub_set_percpu_partial(c, page);
        stat(s, CPU_PARTIAL_ALLOC);
        goto redo;
    }

    freelist = new_slab_objects(s, gfpflags, node, &c);

    if (unlikely(!freelist)) {
        slab_out_of_memory(s, gfpflags, node);
        return NULL;
    }

    page = c->page;
    if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
        goto load_freelist;

    /* Only entered in the debug case */
    if (kmem_cache_debug(s) &&
        !alloc_debug_processing(s, page, freelist, addr))
        goto new_slab;    /* Slab failed checks. Next slab needed */

    deactivate_slab(s, page, get_freepointer(s, freelist), c);
    return freelist;
}
new_slab() is called to request 2^order pages from the buddy system and build them into a slab. The order to use comes from kmem_cache_order_objects; if the first attempt fails, memory is already tight, so the allocator falls back to the min version of kmem_cache_order_objects. The call chain is new_slab()->allocate_slab()->alloc_slab_page()->__alloc_pages_node()->__alloc_pages()->__alloc_pages_nodemask(), which lands us back in the buddy system analyzed above. (A small sketch of how kmem_cache_order_objects encodes the order follows the code below.)
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
            int node, struct kmem_cache_cpu **pc)
{
    void *freelist;
    struct kmem_cache_cpu *c = *pc;
    struct page *page;

    WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));

    freelist = get_partial(s, flags, node, c);

    if (freelist)
        return freelist;

    page = new_slab(s, flags, node);
    if (page) {
        c = raw_cpu_ptr(s->cpu_slab);
        if (c->page)
            flush_slab(s, c);

        /*
         * No other reference to the page yet so we can
         * muck around with it freely without cmpxchg
         */
        freelist = page->freelist;
        page->freelist = NULL;

        stat(s, ALLOC_SLAB);
        c->page = page;
        *pc = c;
    }

    return freelist;
}
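The oo/max/min values mentioned above pack the slab order and the number of objects per slab into a single word (mm/slub.c uses a shift of 16 for this). Here is a hedged userspace restatement of that encoding; the demo values are made up.

/* Sketch: how kmem_cache_order_objects packs order and object count. */
#include <stdio.h>

#define OO_SHIFT 16
#define OO_MASK  ((1U << OO_SHIFT) - 1)

struct order_objects { unsigned int x; };

static unsigned int oo_order(struct order_objects x)   { return x.x >> OO_SHIFT; }
static unsigned int oo_objects(struct order_objects x) { return x.x & OO_MASK; }

int main(void)
{
    /* e.g. an order-3 slab (8 pages) holding 64 objects */
    struct order_objects oo = { (3U << OO_SHIFT) | 64 };

    printf("order = %u, objects per slab = %u\n",
           oo_order(oo), oo_objects(oo));    /* prints: order = 3, ... = 64 */
    return 0;
}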
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
    unsigned int alloc_order, reclaim_order;
    unsigned int classzone_idx = MAX_NR_ZONES - 1;
    pg_data_t *pgdat = (pg_data_t *)p;
    struct task_struct *tsk = current;
    ......
    for ( ; ; ) {
        ......
kswapd_try_sleep:
        kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                            classzone_idx);
        ......
        /*
         * Reclaim begins at the requested order but if a high-order
         * reclaim fails then kswapd falls back to reclaiming for
         * order-0. If that happens, kswapd will consider sleeping
         * for the order it finished reclaiming at (reclaim_order)
         * but kcompactd is woken to compact for the original
         * request (alloc_order).
         */
        trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
                                    alloc_order);
        reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
        ......
    }
    return 0;
}
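kswapd's reclaim activity can be watched from user space through the counters in /proc/vmstat. A minimal sketch follows; counter names such as pgscan_kswapd and pgsteal_kswapd vary somewhat across kernel versions, so treat them as an assumption.

/* Sketch: dump the kswapd-related reclaim counters from /proc/vmstat. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/proc/vmstat", "r");
    char line[128];

    if (!f) {
        perror("fopen /proc/vmstat");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        /* e.g. "pgscan_kswapd 12345", "pgsteal_kswapd 6789" */
        if (strstr(line, "kswapd"))
            fputs(line, stdout);
    }
    fclose(f);
    return 0;
}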