We have seen that when allocating pages, if the number of pages is not enough, page_launder, reclaim_page, __free_page will be called to swap out the page and put it back into allocation.
To avoid having to search for swappable pages and swap them out at the very moment a page fault occurs, when the CPU is already busy, the Linux kernel periodically checks memory and swaps out a number of pages in advance. This frees up space ahead of time and reduces the load on the system when page faults do occur.
To this end, the Linux kernel sets up two daemon ("guardian") kernel threads, kswapd and kreclaimd, which swap out pages periodically.
/*
 * Boot-time setup for the page-out daemons.
 *
 * Prints the kswapd banner, runs swap_setup(), then starts the kswapd and
 * kreclaimd kernel threads with the same clone flags. The results of
 * kernel_thread() are not checked; the function always returns 0.
 */
static int __init kswapd_init(void)
{
	/* Both daemons are created with identical flags. */
	const unsigned long daemon_flags = CLONE_FS | CLONE_FILES | CLONE_SIGNAL;

	printk("Starting kswapd v1.8\n");
	swap_setup();

	kernel_thread(kswapd, NULL, daemon_flags);
	kernel_thread(kreclaimd, NULL, daemon_flags);

	return 0;
}
First analyze kswapd, the code is as follows:
/*
 * Body of the kswapd kernel thread (excerpt; "....." / "......" mark code
 * omitted by the article).
 *
 * After naming itself and blocking all signals, the thread loops forever:
 * when inactive_shortage() or free_shortage() reports a shortage it calls
 * do_try_to_free_pages(), then runs refill_inactive_scan(6, 0), and finally
 * either sleeps on kswapd_wait with an HZ timeout or, if both shortages
 * persist and out_of_memory() is true, calls oom_kill().
 *
 * @unused: not referenced in the visible code.
 * The loop never exits, so no value is returned in the visible code.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	tsk->session =1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);	/* block every signal */
	kswapd_task = tsk;
	.....
	tsk->flags |= PF_MEMALLOC;	/* original note: "on official duty" - set the flag */

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		static int recalc = 0;

		/* If needed, try to free some memory. */
		if (inactive_shortage() || free_shortage()) {
			int wait = 0;
			/* Do we need to do some synchronous flushing? */
			if (waitqueue_active(&kswapd_done))
				wait = 1;
			do_try_to_free_pages(GFP_KSWAPD, wait);	/* the main work function */
		}
		......
		refill_inactive_scan(6, 0);
		......
		......
		if (!free_shortage() || !inactive_shortage()) {
			/* Sleep with an HZ (one second) timeout, then go round the loop again. */
			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
			......
		} else if (out_of_memory()) {
			oom_kill();
		}
	}
}
After some simple initialization, the program enters an infinite loop. At the end of each iteration it generally calls interruptible_sleep_on_timeout() to go to sleep, letting the kernel schedule other processes freely. After at most 1 second (the HZ timeout) the kernel wakes kswapd up and schedules it again, and kswapd then returns to the top of the infinite loop.
The main function executed by this function is do_try_to_free_pages, the code is as follows:
/*
 * One reclaim pass (excerpt; "......" marks code omitted by the article).
 *
 * Step 1: if free pages are short, or there are more inactive-dirty pages
 *         than free plus inactive-clean pages, run page_launder().
 * Step 2: if a free or inactive shortage remains, shrink the dcache and
 *         icache and run refill_inactive(); otherwise reap the slab caches
 *         and force the result to 1.
 *
 * @gfp_mask: passed through unchanged to the helpers.
 * @user:     passed through to page_launder() and refill_inactive().
 * Returns the sum of the helper results, or 1 on the "no shortage" path.
 */
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	int ret = 0;
	......
	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() + nr_inactive_clean_pages())
		ret += page_launder(gfp_mask, user);
	......
	if (free_shortage() || inactive_shortage()) {
		shrink_dcache_memory(6, gfp_mask);
		shrink_icache_memory(6, gfp_mask);
		ret += refill_inactive(gfp_mask, user);
	} else {
		......
		kmem_cache_reap(gfp_mask);
		ret = 1;
	}
	return ret;
}
kmem_cache_reap is used to harvest slab blocks. The slab management mechanism also tends to allocate and maintain more free physical pages and is not keen on returning these pages, so it will be harvested through kmem_cache_reap after a while.
1. The first thing we analyze is refill_inactive, the code is as follows:
/*
 * Move pages towards the inactive lists until a target count is met
 * (excerpt; "......" marks code omitted by the article).
 *
 * The target is inactive_shortage() + free_shortage(), or 1 << page_cluster
 * when @user is non-zero. The loop starts at priority 6 and lowers the
 * priority value (down to 0) each time a round makes no progress.
 *
 * @gfp_mask: passed through to the reaping, shrinking and swap-out helpers.
 * @user:     non-zero selects the small fixed target.
 * Returns non-zero if count dropped below its starting value.
 */
static int refill_inactive(unsigned int gfp_mask, int user)
{
	int priority, count, start_count, made_progress;

	count = inactive_shortage() + free_shortage();
	if (user)
		count = (1 << page_cluster);
	start_count = count;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);	/* reap the slab caches */

	priority = 6;	/* start at the mildest level, 6, and press harder step by step down to 0 */
	do {
		made_progress = 0;

		/*
		 * A kernel thread has to yield voluntarily: it never returns to
		 * user space, so this flag would otherwise never be checked.
		 */
		if (current->need_resched) {
			__set_current_state(TASK_RUNNING);	/* signal that we want to keep running */
			schedule();
		}

		while (refill_inactive_scan(priority, 1)) {
			made_progress = 1;
			if (--count <= 0)	/* target reached: finish early */
				goto done;
		}
		......
		/* Reclaim the dentry and inode structures that have accumulated. */
		shrink_dcache_memory(priority, gfp_mask);
		shrink_icache_memory(priority, gfp_mask);
		......
		while (swap_out(priority, gfp_mask)) {
			made_progress = 1;
			if (--count <= 0)	/* target reached: finish early */
				goto done;
		}
		......
		if (!inactive_shortage() || !free_shortage())
			goto done;	/* one of the shortages is gone: finish early */
		......
		if (!made_progress)
			priority--;
	} while (priority >= 0);

	/* Always end on a refill_inactive.., may sleep... */
	while (refill_inactive_scan(0, 1)) {
		if (--count <= 0)
			goto done;
	}
done:
	return (count < start_count);
}
/*
 * Age pages on the active list, scanning from its tail, and deactivate
 * those that have aged out (excerpt; "......" marks omitted code).
 *
 * @priority: at most nr_active_pages >> priority entries are examined.
 * @oneshot:  non-zero stops the scan at the first page that ends up off
 *            the active list.
 * Returns 1 if at least one scanned page ended up off the active list,
 * otherwise 0. The scan runs with pagemap_lru_lock held.
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
	struct list_head * page_lru;
	struct page * page;
	int maxscan, page_active = 0;
	int ret = 0;

	/* Take the lock while messing with the list... */
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_active_pages >> priority;	/* the whole list is scanned only when priority is 0 */
	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageActive(page)) {	/* is this really an active page? */
			printk("VM: refill_inactive, wrong page on list.\n");
			list_del(page_lru);
			nr_active_pages--;
			continue;
		}

		/* Do aging on the pages. */
		if (PageTestandClearReferenced(page)) {	/* has it been referenced? */
			age_page_up_nolock(page);
			page_active = 1;
		} else {
			age_page_down_ageonly(page);	/* not referenced: reduce the page's age */
			......
			/*
			 * Age has reached 0 and only the baseline references remain
			 * (the article's example: a read-ahead page no process has
			 * claimed yet).
			 */
			if (page->age == 0 && page_count(page) <= (page->buffers ? 2 : 1)) {
				deactivate_page_nolock(page);
				page_active = 0;
			} else {
				page_active = 1;
			}
		}
		......
		if (page_active || PageActive(page)) {
			/* Still active: move it to the head so the tail scan moves on. */
			list_del(page_lru);
			list_add(page_lru, &active_list);
		} else {
			ret = 1;
			if (oneshot)
				break;
		}
	}
	spin_unlock(&pagemap_lru_lock);

	return ret;
}
For a page that was read ahead but has not yet been claimed by any process, the state at this point is: the usage count is 1;
page->list is linked to mapping->clean_pages;
page->next_hash and page->pprev_hash are linked into the global hash table;
page->lru is linked to the global active_list;
After pre-reading, pages that are not claimed by the process will execute deactivate_page_nolock, the code is as follows:
/*
 * Reset a page's age and referenced bit, then move it from the active list
 * to the inactive-dirty list if it qualifies.
 *
 * Reference budget: one for the cache, one for the extra reference the
 * caller holds, and possibly one for the buffers. This isn't perfect, but
 * as long as unfreeable pages never reach the inactive_clean list it does
 * not need to be.
 *
 * Nothing is returned; a page that is not on the active list, holds more
 * references than the budget, or is a ramdisk page keeps its list
 * membership (its age and referenced bit are still cleared).
 */
void deactivate_page_nolock(struct page * page)
{
	int ref_budget = page->buffers ? 3 : 2;

	page->age = 0;
	ClearPageReferenced(page);

	/* Don't touch it if it's not on the active list (some pages are on no list at all). */
	if (!PageActive(page))
		return;
	if (page_count(page) > ref_budget)
		return;
	if (page_ramdisk(page))
		return;

	del_page_from_active_list(page);
	add_page_to_inactive_dirty_list(page);
}
#define del_page_from_active_list(page) { \ list_del(&(page)->lru); \ ClearPageActive(page); \ nr_active_pages--; \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \}
#define add_page_to_inactive_dirty_list(page) { \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ SetPageInactiveDirty(page); \ list_add(&(page)->lru, &inactive_dirty_list); \ nr_inactive_dirty_pages++; \ page->zone->inactive_dirty_pages++; \}
The final usage count is 1;
page->list is linked to mapping->clean_pages; // Since it has not been visited
page->next_hash and page->pprev_hash are linked to the global Hash table;
page->lru is linked to the global inactive_dirty_list;
There is no need to disconnect the mapping, because there is no mapping in the first place.
2. The swap_out function is as follows:
/*
 * Pick the address space with the largest swap_cnt and try to swap pages
 * out of it (excerpt; "......" marks code omitted by the article).
 *
 * @priority: a smaller value gives a larger counter.
 * @gfp_mask: passed through to swap_out_mm().
 * Returns the result of swap_out_mm() for the chosen mm, or 0 if no mm was
 * chosen. In the visible code both branches after the selection end in
 * `break`, so the outer for loop body runs at most once.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	int counter;
	int __ret = 0;
	......
	/* The smaller the priority value (more pressure), the larger counter gets. */
	counter = (nr_threads << SWAP_SHIFT) >> priority;
	if (counter < 1)
		counter = 1;

	for (; counter >= 0; counter--) {
		struct list_head *p;
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;
		int assign = 0;
		int found_task = 0;
	select:
		spin_lock(&mmlist_lock);
		p = init_mm.mmlist.next;
		for (; p != &init_mm.mmlist; p = p->next) {
			struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
			if (mm->rss <= 0)
				continue;
			found_task++;
			/* Refresh swap_cnt? */
			if (assign == 1) {
				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
				if (mm->swap_cnt < SWAP_MIN)
					mm->swap_cnt = SWAP_MIN;
			}
			/*
			 * swap_cnt is the number of this mm's pages not yet examined;
			 * remember the mm with the largest value.
			 */
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
			}
		}

		/* Make sure it doesn't disappear */
		if (best)
			atomic_inc(&best->mm_users);	/* take a reference on the mm */
		spin_unlock(&mmlist_lock);
		......
		if (!best) {
			/* Nothing chosen: refresh every swap_cnt once and rescan. */
			if (!assign && found_task > 0) {
				assign = 1;
				goto select;
			}
			break;
		} else {
			__ret = swap_out_mm(best, gfp_mask);	/* the real work */
			mmput(best);	/* drop the mm_users reference taken above */
			break;
		}
	}
	return __ret;
}
The try_to_swap_out function is as follows:
/*
 * Try to unmap one page from a process page table (excerpt; "......" marks
 * code omitted by the article, so only the swap-cache path is shown).
 *
 * A page is unmapped only if its pte was not recently accessed AND
 * page->age is 0; any page with age > 0 is skipped.
 *
 * @mm:         address space being scanned; swap_cnt and rss are updated.
 * @vma:        used for the TLB flush.
 * @address:    virtual address of the pte, used for the TLB flush.
 * @page_table: the pte to examine.
 * @gfp_mask:   not referenced in the visible code.
 * Returns 1 when mm->swap_cnt is already 0. Every other visible path
 * returns 0 through out_failed, including the path that unmaps the page.
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int onlist;

	pte = *page_table;
	if (!pte_present(pte))	/* is the physical page in memory at all? */
		goto out_failed;

	page = pte_page(pte);
	if ((!VALID_PAGE(page)) || PageReserved(page))
		goto out_failed;

	if (!mm->swap_cnt)
		return 1;

	mm->swap_cnt--;	/* one fewer page left to examine in this mm */

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {	/* accessed: age it up and give up */
		age_page_up(page);
		goto out_failed;
	}
	if (!onlist)
		/* The page is still mapped, so it can't be freeable... */
		age_page_down_ageonly(page);
	......
	if (page->age > 0)	/* age has not dropped to 0 yet: give up */
		goto out_failed;

	if (TryLockPage(page))
		goto out_failed;
	......
	/*
	 * Reaching here means the page was not accessed recently and its age
	 * is 0: clear the page table entry.
	 */
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);
	......
	if (PageSwapCache(page)) {	/* the page is already in the swap cache (swapper_space) */
		entry.val = page->index;	/* location of the data on the swap device */
		/* The article notes that swap-in leaves the pte writable and dirty. */
		if (pte_dirty(pte))
			set_page_dirty(page);	/* taken in the article's walkthrough */
set_swap_pte:
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));	/* the pte now holds the swap entry */
drop_pte:
		UnlockPage(page);
		mm->rss--;
		deactivate_page(page);	/* see deactivate_page_nolock earlier in the article */
		page_cache_release(page);	/* drop one reference */
out_failed:
		return 0;
	}
	......
}
The first point is that it has not been accessed recently. The judgment standard is:
/*
 * Test and clear the accessed bit (_PAGE_BIT_ACCESSED) of a page table
 * entry. Returns whatever test_and_clear_bit() reports for that bit.
 */
static inline int ptep_test_and_clear_young(pte_t *ptep)
{
	int was_accessed = test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep);

	return was_accessed;
}
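Second point: page->age must have dropped to 0. The check is `if (page->age > 0) goto out_failed;`, so any page whose age is still greater than 0 is skipped.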
set_page_dirty function, as follows:
/*
 * Mark a page dirty. The PG_dirty bit is set with test_and_set_bit(), and
 * __set_page_dirty() is called only when that call returns zero (per its
 * name, when the bit was not already set).
 */
static inline void set_page_dirty(struct page * page)
{
	int already_dirty = test_and_set_bit(PG_dirty, &page->flags);

	if (already_dirty)
		return;

	__set_page_dirty(page);
}
/*
 * Move a page onto its mapping's dirty_pages list and mark the owning
 * inode as having dirty pages.
 *
 * The list manipulation is done under pagecache_lock; the inode is marked
 * after the lock is released. page->mapping is dereferenced without a
 * NULL check.
 */
void __set_page_dirty(struct page *page)
{
	struct address_space *owner = page->mapping;

	spin_lock(&pagecache_lock);
	/* Unlink from whichever mapping list the page is on, then re-add as dirty. */
	list_del(&page->list);
	list_add(&page->list, &owner->dirty_pages);
	spin_unlock(&pagecache_lock);

	mark_inode_dirty_pages(owner->host);
}
Finally, try_to_swap_out is executed, and the result is:
The usage count is 1;
page->list is linked to mapping->dirty_pages;
page->next_hash and page->pprev_hash are linked to the global Hash table;
page->lru is linked to the global inactive_dirty_list;
page->flags中的PG_dirty标志位被置位。
由于out_failed返回0,使swap_out_mm能够依次考察和处理一个进程的所有页面。
二、看完了refill_inactive,返回函数do_try_to_free_pages,来看一下page_launder,代码如下:
/*
 * Scan the inactive_dirty list from its tail and move clean pages to their
 * zone's inactive_clean list (excerpt; "......" / "......." mark code
 * omitted by the article).
 *
 * Pass 1 (launder_loop == 0) only re-queues dirty pages. If __GFP_IO is
 * set in @gfp_mask and free pages are still short afterwards, a second
 * pass (launder_loop == 1) writes dirty pages out through their mapping's
 * writepage operation.
 *
 * @gfp_mask: only __GFP_IO is inspected in the visible code.
 * @sync:     only cleared in the visible code; its use lies in omitted code.
 * Returns the number of pages moved to the inactive_clean list.
 */
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;
	struct page * page;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

	launder_loop = 0;
	maxlaunder = 0;
	cleaned_pages = 0;

dirty_page_rescan:
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
				maxscan-- > 0) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {	/* not an inactive-dirty page: it is here by mistake */
			printk("VM: page_launder, wrong page on list.\n");
			list_del(page_lru);
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;
			continue;
		}

		/* Page is or was in use?  Move it to the active list. */
		/* (The author defers the explanation of this test to a separate post.) */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1) ||
				page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {	/* somebody else already holds the page lock */
			list_del(page_lru);
			/* Re-queue at the head, the end farthest from this tail-first scan. */
			list_add(page_lru, &inactive_dirty_list);
			continue;
		}

		/*
		 * Dirty swap-cache page? Write it out if
		 * last copy..
		 */
		if (PageDirty(page)) {	/* the page is dirty */
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
			int result;

			if (!writepage)
				goto page_active;

			/* First time through? Move it to the back of the list */
			if (!launder_loop) {	/* launder_loop is 0 on the first pass */
				list_del(page_lru);
				list_add(page_lru, &inactive_dirty_list);	/* head = far end from the scan */
				UnlockPage(page);
				continue;
			}

			/* Second pass only from here on. */
			ClearPageDirty(page);	/* clear the dirty flag */
			page_cache_get(page);	/* take a reference across the write */
			spin_unlock(&pagemap_lru_lock);

			result = writepage(page);	/* write the page out; it is clean afterwards */
			page_cache_release(page);	/* drop the reference again */

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
			if (result != 1)
				continue;
			/* writepage refused to do anything */
			set_page_dirty(page);	/* not reached in the article's walkthrough */
			goto page_active;
		}

		/* Per the article, the page is necessarily clean at this point. */
		......
		if (page->buffers) {
			......
		} else if (page->mapping && !PageDirty(page)) {
			/* Clean page, including one that was dirty and has just been written. */
			.......
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);
			UnlockPage(page);
			cleaned_pages++;
		} else {
page_active:
			......
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			UnlockPage(page);
		}
	}
	spin_unlock(&pagemap_lru_lock);
	......
	if (can_get_io_locks && !launder_loop && free_shortage()) {	/* still short of allocatable pages */
		launder_loop = 1;	/* at most two passes in total */
		/* If we cleaned pages, never do synchronous IO. */
		if (cleaned_pages)
			sync = 0;
		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;
		/* Kflushd takes care of the rest. */
		wakeup_bdflush(0);
		goto dirty_page_rescan;	/* start the second pass */
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}
#define del_page_from_inactive_dirty_list(page) { \ list_del(&(page)->lru); \ ClearPageInactiveDirty(page); \ nr_inactive_dirty_pages--; \ page->zone->inactive_dirty_pages--; \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \}
#define add_page_to_inactive_clean_list(page) { \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ SetPageInactiveClean(page); \ list_add(&(page)->lru, &page->zone->inactive_clean_list); \ page->zone->inactive_clean_pages++; \}
最后执行完page_launder,结果是:
使用计数为1;
page->list链入mapping->dirty_pages或者clean_pages(保持原样);
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了page->zone->inactive_clean_list;
然后,我们分析kreclaimd,代码如下:
/*
 * Body of the kreclaimd kernel thread.
 *
 * After naming itself and blocking all signals, the thread loops forever:
 * it sleeps on kreclaimd_wait until woken, then walks every node and zone
 * and, while a zone's free_pages is below pages_low, takes a page from
 * reclaim_page() and releases it with __free_page().
 *
 * @unused: not referenced.
 * The loop never exits, so no value is returned.
 */
int kreclaimd(void *unused)
{
	struct task_struct *tsk = current;
	pg_data_t *pgdat;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kreclaimd");
	sigfillset(&tsk->blocked);	/* block every signal */
	current->flags |= PF_MEMALLOC;	/* original note: "on official duty" */

	while (1) {

		/*
		 * We sleep until someone wakes us up from
		 * page_alloc.c::__alloc_pages().
		 */
		interruptible_sleep_on(&kreclaimd_wait);

		/*
		 * Move some pages from the inactive_clean lists to
		 * the free lists, if it is needed.
		 */
		pgdat = pgdat_list;
		do {
			int i;
			for(i = 0; i < MAX_NR_ZONES; i++) {
				zone_t *zone = pgdat->node_zones + i;
				if (!zone->size)	/* skip empty zones */
					continue;

				while (zone->free_pages < zone->pages_low) {
					struct page * page;
					page = reclaim_page(zone);	/* the main work */
					if (!page)	/* nothing reclaimable left in this zone */
						break;
					__free_page(page);
				}
			}
			pgdat = pgdat->node_next;
		} while (pgdat);
	}
}
/*
 * Find one reclaimable page on a zone's inactive_clean list, detach it from
 * the swap cache or page cache and from the LRU, and hand it to the caller.
 *
 * The scan starts at the list tail and examines at most
 * zone->inactive_clean_pages entries. Pages that turn out to be in use go
 * back to the active list; dirty, buffered or locked pages go to the
 * inactive_dirty list.
 *
 * @zone: the zone whose inactive_clean_list is scanned.
 * Returns the detached page (age reset to PAGE_AGE_START), or NULL if none
 * was found. memory_pressure is incremented on every call.
 */
struct page * reclaim_page(zone_t * zone)
{
	struct page * page = NULL;
	struct list_head * page_lru;
	int maxscan;

	/*
	 * We only need the pagemap_lru_lock if we don't reclaim the page,
	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
	 * to avoid deadlocks and most of the time we'll succeed anyway.
	 */
	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);
	maxscan = zone->inactive_clean_pages;
	/* Scan zone->inactive_clean_list from its tail. */
	while ((page_lru = zone->inactive_clean_list.prev) !=
			&zone->inactive_clean_list && maxscan--) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveClean(page)) {	/* on this list by mistake */
			printk("VM: reclaim_page, wrong page on list.\n");
			list_del(page_lru);
			page->zone->inactive_clean_pages--;
			continue;
		}

		/* Page is or was in use?  Move it to the active list. */
		/* (The author defers the explanation of this test to a separate post.) */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/* The page is dirty, or locked, move to inactive_dirty list. */
		if (page->buffers || PageDirty(page) || TryLockPage(page)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_inactive_dirty_list(page);
			continue;
		}

		/* OK, remove the page from the caches. */
		if (PageSwapCache(page)) {	/* the page is in the swap cache (swapper_space) */
			__delete_from_swap_cache(page);	/* path taken in the article's walkthrough */
			goto found_page;
		}

		if (page->mapping) {
			__remove_inode_page(page);
			goto found_page;
		}

		/* We should never ever get here. */
		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
		list_del(page_lru);
		zone->inactive_clean_pages--;
		UnlockPage(page);
	}
	/* Reset page pointer, maybe we encountered an unfreeable page. */
	page = NULL;
	goto out;

found_page:
	del_page_from_inactive_clean_list(page);	/* path taken in the article's walkthrough */
	UnlockPage(page);
	page->age = PAGE_AGE_START;
	/* Only the caller's reference is expected to remain. */
	if (page_count(page) != 1)
		printk("VM: reclaim_page, found page with count %d!\n",
				page_count(page));
out:
	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
	memory_pressure++;
	return page;
}
__delete_from_swap_cache函数,代码如下:
/*
 * Take a page out of the swap cache and release its swap entry.
 *
 * The swap entry value is read from page->index before the page is removed
 * from the cache, then passed to swap_free(). When SWAP_CACHE_INFO is
 * defined, the deletion counter is incremented as well.
 */
void __delete_from_swap_cache(struct page *page)
{
	swp_entry_t slot;

	/* Capture the entry first; it is needed after the page leaves the cache. */
	slot.val = page->index;

#ifdef SWAP_CACHE_INFO
	swap_cache_del_total++;
#endif

	remove_from_swap_cache(page);
	swap_free(slot);
}
/*
 * Detach a page from the swap cache.
 *
 * Sanity checks: the page's mapping must be swapper_space (else BUG()), and
 * the page must be both flagged SwapCache and locked (else PAGE_BUG()).
 * After that the SwapCache and dirty flags are cleared and the page is
 * removed through __remove_inode_page().
 */
static inline void remove_from_swap_cache(struct page *page)
{
	if (page->mapping != &swapper_space) {
		BUG();
	}

	if (!(PageSwapCache(page) && PageLocked(page))) {
		PAGE_BUG(page);
	}

	PageClearSwapCache(page);
	ClearPageDirty(page);
	__remove_inode_page(page);
}
/*
 * Remove a page from its mapping's page list and from the page hash, then
 * clear page->mapping. Calling this on a page that is still flagged dirty
 * triggers BUG().
 */
void __remove_inode_page(struct page *page)
{
	if (PageDirty(page)) {
		BUG();
	}

	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);

	page->mapping = NULL;
}
/*
 * Unlink a page from its mapping's page list: decrement the mapping's
 * nrpages, delete the page->list entry, and clear page->mapping.
 * page->mapping is dereferenced without a NULL check.
 */
static inline void remove_page_from_inode_queue(struct page * page)
{
	page->mapping->nrpages--;
	list_del(&page->list);
	page->mapping = NULL;
}
/*
 * Unlink a page from its hash chain and decrement page_cache_size.
 *
 * The chain is singly linked with a back-pointer to the previous link
 * field (pprev_hash), so removal patches the successor's back-pointer (if
 * there is a successor) and the predecessor's forward link. Only
 * pprev_hash is reset to NULL afterwards; next_hash is left as it was.
 */
static inline void remove_page_from_hash_queue(struct page * page)
{
	struct page *succ = page->next_hash;
	struct page **back_link = page->pprev_hash;

	if (succ != NULL) {
		succ->pprev_hash = back_link;
	}
	*back_link = succ;

	page->pprev_hash = NULL;
	atomic_dec(&page_cache_size);
}
#define del_page_from_inactive_clean_list(page) { \ list_del(&(page)->lru); \ ClearPageInactiveClean(page); \ page->zone->inactive_clean_pages--; \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \}
最后执行完reclaim_page,结果是:
使用计数为1;
page->list为空;
page->next_hash和page->pprev_hash为空;
page->lru为空;
回到kreclaimd,会执行__free_page,此时使用计数减为0,回收这个页面到free_area[MAX_ORDER],下次alloc_page就能分配到了。
/*
 * Drop a reference on a block of pages and free it when appropriate.
 *
 * Reserved pages are left alone. Otherwise put_page_testzero() is called,
 * and only when it returns true (per the original annotation, when the use
 * count has reached 0) is the block handed to __free_pages_ok().
 *
 * @page:  first page of the block.
 * @order: passed through unchanged to __free_pages_ok().
 */
void __free_pages(struct page *page, unsigned long order)
{
	if (PageReserved(page))
		return;

	/* Use count has not reached zero: someone else still holds the page. */
	if (!put_page_testzero(page))
		return;

	__free_pages_ok(page, order);
}
kswapd内核线程:
1、refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age已减为0(代码中age大于0的页面会被跳过)。
2、page_launder,把不活跃脏的页面变成不活跃干净的页面。
kreclaimd内核线程:
3、把不活跃干净的页面,所有的链表关系都清除,但使用计数仍然为1。
4、__free_page,此时使用计数减为0,回收这个页面到free_area[MAX_ORDER],下次alloc_page就能分配到了。