In the following situations, a page error exception (also called page missing interrupt) will occur:
1. The corresponding page directory entry or page table entry is empty, that is, the mapping between the linear address and the physical address has not been established or has been revoked.
2. The corresponding physical page is not in the memory. This article discusses this situation.
3. The access method specified in the instruction does not match the permissions of the page, such as trying to write a "read-only" page.
Assume that the mapping has been established, but the lowest bit, P, of the page table entry is 0, indicating that the page is not in memory. The entire page table entry is as shown below: offset indicates the location of the page on a disk device, i.e. its logical page number on that device, and type indicates which disk device the page is on.
The entry point of do_page_fault().
The code is as follows: arch/i386/mm/fault.c
The kernel's interrupt/exception response mechanism also passes in two parameters. One is the pt_regs structure pointer regs, which points to a copy of the contents of each CPU register before the exception occurred. The other, error_code, further indicates the specific reason for the mapping failure.
/*
 * Page-fault exception handler (excerpt; the article elides parts of the
 * function with "...." markers, so several goto targets are not shown).
 *
 * regs:       saved register state; regs->cs, regs->eip and regs->eflags
 *             are read below.
 * error_code: its low two bits select the case in the switch below.
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	__asm__("movl %%cr2,%0":"=r" (address));	/* store the address whose mapping failed (read from CR2) in address */

	tsk = current;	/* task_struct */

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (address >= TASK_SIZE)
		goto vmalloc_fault;

	mm = tsk->mm;	/* mm_struct */
	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down(&mm->mmap_sem);

	vma = find_vma(mm, address);	/* per the article: the first region whose end address is above the given address */
	if (!vma)	/* none found: no region ends above the address; the article places such an address at or above 3 GB - TODO confirm */
		goto bad_area;
	if (vma->vm_start <= address)	/* region starts at or below address: the mapping exists, go to good_area to examine why the access failed */
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	....
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {	/* in this walkthrough: 110b & 011b = 2 */
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;	/* the walkthrough reaches this point */
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up(&mm->mmap_sem);
	return;
	.......}
bit 0 == 0 means no page found, 1 means protection fault
bit 1 == 0 means read, 1 means write
bit 2 == 0 means kernel, 1 means user-mode
In this walkthrough error_code is 110 (binary): user mode, page not in memory, write access.
handle_mm_fault function, the code is as follows:
handle_pte_fault function, as follows:
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access){ int ret = -1; pgd_t *pgd; pmd_t *pmd; pgd = pgd_offset(mm, address);//返回页面表项指针 pmd = pmd_alloc(pgd, address);//中转了一下,还是页目录表项指针 if (pmd) { pte_t * pte = pte_alloc(pmd, address);//返回指向页表项的指针 if (pte) ret = handle_pte_fault(mm, vma, address, write_access, pte); } return ret;}
The do_swap_page function is as follows:
/*
 * Resolve a fault on one page table entry.
 *
 * If the entry is not present, the lock is dropped and the fault is handed
 * to do_no_page() (entry is empty) or do_swap_page() (entry is non-empty).
 * If the entry is present, a write to a non-writable entry goes to
 * do_wp_page(); otherwise the entry is marked young (and dirty on write),
 * re-established, and 1 is returned.
 */
static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;	/* contents of the page table entry */
	if (!pte_present(entry)) {	/* the page is not in memory */
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		if (pte_none(entry))	/* taken only when the entry is empty; in this walkthrough it is not empty */
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);	/* the walkthrough reaches this point */
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}
/*
 * Bring the page named by swap entry 'entry' back in and install it at
 * 'page_table'.
 *
 * Returns 1 once the entry has been set, or -1 when read_swap_cache()
 * yields no page.
 */
static int do_swap_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, swp_entry_t entry, int write_access)
{
	struct page *page = lookup_swap_cache(entry);	/* look the page up in the hash table */
	pte_t pte;

	if (!page) {
		lock_kernel();
		swapin_readahead(entry);	/* read pages ahead */
		page = read_swap_cache(entry);	/* actually obtain a page: it may be found in the hash table thanks to the read-ahead above, or be newly allocated and filled from disk */
		unlock_kernel();
		if (!page)
			return -1;

		flush_page_to_ram(page);
		flush_icache_page(vma, page);
	}

	mm->rss++;

	pte = mk_pte(page, vma->vm_page_prot);	/* build the page table entry */

	/*
	 * Freeze the "shared"ness of the page, ie page_count + swap_count.
	 * Must lock page before transferring our swap count to already
	 * obtained page count.
	 */
	lock_page(page);
	swap_free(entry);
	if (write_access && !is_page_shared(page))
		pte = pte_mkwrite(pte_mkdirty(pte));	/* on a write to an unshared page, mark the entry dirty and writable */
	UnlockPage(page);

	set_pte(page_table, pte);	/* install the entry (attributes set above) so it points at the page */
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	return 1;	/* Minor fault */
}
1. Each function is explained below. First, the swapin_readahead function, as follows:
It reads ahead adjacent pages from the swap device. As described below, after __get_free_page the page's usage count is 1; add_to_swap_cache increases it to 2; page_cache_release then brings it back to 1. It does not become 2 again until a process claims the page.
/*
 * Start read-ahead for swap entries near 'entry' (excerpt; part of the
 * loop body is elided in the article with "......").
 *
 * For each of the 'num' offsets reported by valid_swaphandles(), a page is
 * requested through read_swap_cache_async() with wait == 0, and the
 * reference it returns is dropped again straight away.
 */
void swapin_readahead(swp_entry_t entry)
{
	int i, num;
	struct page *new_page;
	unsigned long offset;

	/*
	 * Get the number of handles we should do readahead io to. Also,
	 * grab temporary references on them, releasing them as io completes.
	 */
	num = valid_swaphandles(entry, &offset);
	for (i = 0; i < num; offset++, i++) {
		......
		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
		if (new_page != NULL)
			page_cache_release(new_page);	/* drop the page's use count by 1 */
		swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
	}
	return;
}
add_to_swap_cache function is the focus, the code is as follows:
/*
 * Return the page for swap entry 'entry', annotated here for the
 * read-ahead case in which the page is not yet in the swap cache.
 *
 * Returns the page found in the swap cache if there is one; otherwise a
 * newly allocated page that has been added to the swap cache and handed to
 * rw_swap_page(); or 0 when swap_duplicate() fails or no page can be
 * allocated.
 */
struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
	struct page *found_page = 0, *new_page;
	unsigned long new_page_addr;

	/*
	 * Make sure the swap entry is still in use.
	 */
	if (!swap_duplicate(entry))	/* Account for the swap cache */
		goto out;
	/*
	 * Look for the page in the swap cache.
	 */
	found_page = lookup_swap_cache(entry);	/* assume it is not found */
	if (found_page)
		goto out_free_swap;

	new_page_addr = __get_free_page(GFP_USER);	/* freshly allocated page; per the article its use count is 1 */
	if (!new_page_addr)
		goto out_free_swap;	/* Out of memory */
	new_page = virt_to_page(new_page_addr);	/* convert the address to the matching struct page pointer */

	/*
	 * Check the swap cache again, in case we stalled above.
	 */
	found_page = lookup_swap_cache(entry);	/* assume it is not found */
	if (found_page)
		goto out_free_page;
	/*
	 * Add it to the swap cache and read its contents.
	 */
	lock_page(new_page);
	add_to_swap_cache(new_page, entry);	/* link the page into the corresponding lists */
	rw_swap_page(READ, new_page, wait);	/* actually read the data from disk into the new page; the article defers the details to its block-device-driver chapter */
	return new_page;

out_free_page:
	page_cache_release(new_page);
out_free_swap:
	swap_free(entry);
out:
	return found_page;
}
/*
 * Put a locked page into the swap cache under swap entry 'entry'.
 *
 * BUG()s if the page is not locked, if PageTestandSetSwapCache() reports
 * a non-zero result, or if the page already has a mapping. Then clears
 * PG_error and PG_arch_1, sets PG_uptodate, and adds the page to
 * swapper_space indexed by entry.val.
 */
void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
#ifdef SWAP_CACHE_INFO
	swap_cache_add_total++;
#endif
	/* Sanity checks, in the same order as before. */
	if (!PageLocked(page))
		BUG();
	if (PageTestandSetSwapCache(page))
		BUG();
	if (page->mapping)
		BUG();

	/* Drop the error/arch bits and mark the page up to date in one step. */
	page->flags = (page->flags & ~((1 << PG_error) | (1 << PG_arch_1)))
			| (1 << PG_uptodate);

	add_to_page_cache_locked(page, &swapper_space, entry.val);
}
add_to_page_cache_locked function, the code is as follows:
/*
 * Add an already-locked page to 'mapping' at 'index'.
 *
 * BUG()s if the page is not locked. Takes a reference on the page, then,
 * under pagecache_lock, records the index and links the page into the
 * mapping's page list, the page hash and the LRU.
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
{
	if (!PageLocked(page))
		BUG();

	page_cache_get(page);	/* take a reference; in the walkthrough the use count is now 2 */
	spin_lock(&pagecache_lock);
	page->index = index;	/* index holds the swap entry value */
	add_page_to_inode_queue(mapping, page);	/* links page->list into mapping->clean_pages */
	add_page_to_hash_queue(page, page_hash(mapping, index));	/* links page->next_hash and page->pprev_hash into the global hash table */
	lru_cache_add(page);	/* links page->lru into the global active_list */
	spin_unlock(&pagecache_lock);
}
The add_page_to_hash_queue function is as follows:
/*
 * Attach 'page' to 'mapping': bump the mapping's page count, link
 * page->list onto the mapping's clean_pages list, and point page->mapping
 * back at the mapping (swapper_space when called for a swap-cache page).
 */
static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
{
	mapping->nrpages++;
	list_add(&page->list, &mapping->clean_pages);
	page->mapping = mapping;
}
/*
 * Address space used as the mapping for swap-cache pages:
 * add_to_swap_cache() passes &swapper_space to add_to_page_cache_locked(),
 * and lookup_swap_cache() searches it.
 */
struct address_space swapper_space = {
	LIST_HEAD_INIT(swapper_space.clean_pages),	/* list head for clean pages */
	LIST_HEAD_INIT(swapper_space.dirty_pages),	/* list head for dirty pages */
	LIST_HEAD_INIT(swapper_space.locked_pages),	/* list head for locked pages */
	0,				/* nrpages */
	&swap_aops,			/* presumably the address-space operations table - confirm against struct address_space */
};
static void add_page_to_hash_queue(struct page * page, struct page **p){ struct page *next = *p;//page->next_hash和page->pprev_hash链入全局的Hash表 *p = page; page->next_hash = next; page->pprev_hash = p; if (next) next->pprev_hash = &page->next_hash; if (page->buffers) PAGE_BUG(page); atomic_inc(&page_cache_size);}
/*
 * Put a locked page on the active list, under pagemap_lru_lock.
 *
 * BUG()s if the page is not locked. A page whose age is zero is passed to
 * deactivate_page_nolock() right after being added.
 */
void lru_cache_add(struct page * page)
{
	spin_lock(&pagemap_lru_lock);
	if (!PageLocked(page))
		BUG();
	DEBUG_ADD_PAGE
	add_page_to_active_list(page);
	/* This should be relatively rare */
	if (!page->age)
		deactivate_page_nolock(page);
	spin_unlock(&pagemap_lru_lock);
}
2. The read_swap_cache function is explained below, as follows:
#define add_page_to_active_list(page) { \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ //page->lru链入了全局的active_list nr_active_pages++; \ //全局的nr_active_pages加1}
/*
 * Read the page for swap entry 'entry' via read_swap_cache_async() with
 * its 'wait' argument set to 1.
 *
 * The definition does not end in a semicolon, so the macro expands to a
 * plain expression: it can be used inside conditions, assignments and
 * argument lists, and `page = read_swap_cache(entry);` no longer expands
 * to a statement followed by an empty statement.
 */
#define read_swap_cache(entry) read_swap_cache_async(entry, 1)
/*
 * Return the page for swap entry 'entry', annotated here for the case in
 * which the page is already in the swap cache.
 *
 * Returns the page found in the swap cache if there is one; otherwise a
 * newly allocated page that has been added to the swap cache and handed to
 * rw_swap_page(); or 0 when swap_duplicate() fails or no page can be
 * allocated.
 */
struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
	struct page *found_page = 0, *new_page;
	unsigned long new_page_addr;

	/*
	 * Make sure the swap entry is still in use.
	 */
	if (!swap_duplicate(entry))	/* Account for the swap cache */
		goto out;
	/*
	 * Look for the page in the swap cache.
	 */
	found_page = lookup_swap_cache(entry);	/* assume the page is found in the hash table: a process has now claimed it, so per the article its use count is 2 */
	if (found_page)
		goto out_free_swap;

	new_page_addr = __get_free_page(GFP_USER);
	if (!new_page_addr)
		goto out_free_swap;	/* Out of memory */
	new_page = virt_to_page(new_page_addr);

	/*
	 * Check the swap cache again, in case we stalled above.
	 */
	found_page = lookup_swap_cache(entry);	/* __get_free_page may have found too few free pages and switched to another process; after switching back, search the hash table once more */
	if (found_page)
		goto out_free_page;
	/*
	 * Add it to the swap cache and read its contents.
	 */
	lock_page(new_page);
	add_to_swap_cache(new_page, entry);
	rw_swap_page(READ, new_page, wait);
	return new_page;

out_free_page:
	page_cache_release(new_page);
out_free_swap:
	swap_free(entry);
out:
	return found_page;
}
find_lock_page function, as follows:
/*
 * Look up the swap-cache page for swap entry 'entry'.
 *
 * Returns 0 when find_lock_page() finds nothing. A page that is no longer
 * marked as swap cache is unlocked and released and the search is retried;
 * otherwise the page is unlocked and returned.
 *
 * NOTE(review): this excerpt is truncated - the out_bad label targeted
 * below and the function's final closing brace are not shown.
 */
struct page * lookup_swap_cache(swp_entry_t entry)
{
	struct page *found;

#ifdef SWAP_CACHE_INFO
	swap_cache_find_total++;
#endif
	while (1) {
		/*
		 * Right now the pagecache is 32-bit only. But it's a 32 bit index. =)
		 */
repeat:
		found = find_lock_page(&swapper_space, entry.val);	/* entry.val is the swap entry value */
		if (!found)
			return 0;
		/*
		 * Though the "found" page was in the swap cache an instant
		 * earlier, it might have been removed by refill_inactive etc.
		 * Re search ... Since find_lock_page grabs a reference on
		 * the page, it can not be reused for anything else, namely
		 * it can not be associated with another swaphandle, so it
		 * is enough to check whether the page is still in the scache.
		 */
		if (!PageSwapCache(found)) {
			UnlockPage(found);
			page_cache_release(found);
			goto repeat;
		}
		if (found->mapping != &swapper_space)
			goto out_bad;
#ifdef SWAP_CACHE_INFO
		swap_cache_find_success++;
#endif
		UnlockPage(found);
		return found;
	}
__find_lock_page function, as follows:
/*
 * Convenience wrapper: calls __find_lock_page() with the hash slot that
 * page_hash() computes for (mapping, index).
 */
#define find_lock_page(mapping, index) \
	__find_lock_page(mapping, index, page_hash(mapping, index))
__find_page_nolock function, as follows:
/*
 * Find the page for (mapping, offset) on the hash chain at 'hash', take a
 * reference on it and lock it.
 *
 * Returns the locked page, or NULL when no matching page is on the chain.
 * If the page has lost its mapping by the time the lock is obtained, it is
 * unlocked and released and the lookup starts over.
 */
struct page * __find_lock_page (struct address_space *mapping, unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);	/* *hash is the head of one chain of the hash table */
	if (page) {
		page_cache_get(page);	/* increase the use count */
		spin_unlock(&pagecache_lock);

		lock_page(page);

		/* Is the page still hashed? Ok, good.. */
		if (page->mapping)
			return page;

		/* Nope: we raced. Release and try again.. */
		UnlockPage(page);
		page_cache_release(page);
		goto repeat;
	}
	spin_unlock(&pagecache_lock);
	return NULL;
}
Search the hash table for the page structure that matches the given swap entry.
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page){ goto inside; for (;;) { page = page->next_hash;//从hash表中寻找inside: if (!page) goto not_found; if (page->mapping != mapping) continue; if (page->index == offset) break; } /* * Touching the page may move it to the active list. * If we end up with too few inactive pages, we wake * up kswapd. */ age_page_up(page); if (inactive_shortage() > inactive_target / 2 && free_shortage()) wakeup_kswapd(0);not_found: return page;}
Whether read_swap_cache finds the page in the hash table or allocates a page itself and adds it to the corresponding lists, the final usage count is 2 in both cases.
swapin_readahead reads many pages ahead. If such a page is not claimed by a process, its usage count is 1.
swapin_readahead(entry);//预读页面 page = read_swap_cache(entry);//真正得到一个页面,这个页面可能从hash表中寻找到,因为上面预读了。或者自己申请页面,并且从盘上将其内容读进来。