diff -Nur linux-2.4.24/arch/i386/kernel/entry.S linux-2.4.24-vmcomm/arch/i386/kernel/entry.S --- linux-2.4.24/arch/i386/kernel/entry.S 2003-06-13 10:51:29.000000000 -0400 +++ linux-2.4.24-vmcomm/arch/i386/kernel/entry.S 2004-07-17 16:50:10.000000000 -0400 @@ -663,6 +663,9 @@ .long SYMBOL_NAME(sys_ni_syscall) /* sys_epoll_wait */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_remap_file_pages */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_tid_address */ + .long SYMBOL_NAME(sys_vm_register) /* 259 */ + .long SYMBOL_NAME(sys_vm_relinquish) /* 260 */ + .long SYMBOL_NAME(sys_vm_getswaprate) /* 261 */ .rept NR_syscalls-(.-sys_call_table)/4 .long SYMBOL_NAME(sys_ni_syscall) diff -Nur linux-2.4.24/fs/exec.c linux-2.4.24-vmcomm/fs/exec.c --- linux-2.4.24/fs/exec.c 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/fs/exec.c 2004-07-17 16:50:14.000000000 -0400 @@ -311,6 +311,8 @@ if (vma) prot = vma->vm_page_prot; set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); + /* LWRMAP: register the pte just installed as a reverse mapping of the page */ + page_add_rmap(page, pte); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); diff -Nur linux-2.4.24/include/asm-generic/rmap.h linux-2.4.24-vmcomm/include/asm-generic/rmap.h --- linux-2.4.24/include/asm-generic/rmap.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.4.24-vmcomm/include/asm-generic/rmap.h 2004-07-17 16:50:14.000000000 -0400 @@ -0,0 +1,60 @@ +#ifndef _GENERIC_RMAP_H +#define _GENERIC_RMAP_H + +/* LWRMAP: This entire file is a necessary addition. */ + +/* + * linux/include/asm-generic/rmap.h + * + * Architecture dependent parts of the reverse mapping code, + * this version should work for most architectures with a + * 'normal' page table layout.
+ * + * We use the struct page of the page table page to find out + * the process and full address of a page table entry: + * - page->mapping points to the process' mm_struct + * - page->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif + page->mapping = (void *)mm; /* owner mm, read back by ptep_to_mm() */ + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); /* address rounded down to the span covered by one page table page */ +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + page->mapping = NULL; + page->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + return (struct mm_struct *) page->mapping; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + unsigned long low_bits; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; /* byte offset of the pte inside its page table page, times PTRS_PER_PTE; NOTE(review): equals the mapped offset only when PTRS_PER_PTE * sizeof(pte_t) == PAGE_SIZE - confirm per arch */ + return page->index + low_bits; +} + +#endif /* _GENERIC_RMAP_H */ diff -Nur linux-2.4.24/include/asm-i386/rmap.h linux-2.4.24-vmcomm/include/asm-i386/rmap.h --- linux-2.4.24/include/asm-i386/rmap.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.4.24-vmcomm/include/asm-i386/rmap.h 2004-07-17 16:50:14.000000000 -0400 @@ -0,0 +1,9 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* LWRMAP: This entire file is a necessary addition.
*/ + +/* nothing to see, move along */ +#include + +#endif diff -Nur linux-2.4.24/include/asm-i386/siginfo.h linux-2.4.24-vmcomm/include/asm-i386/siginfo.h --- linux-2.4.24/include/asm-i386/siginfo.h 2002-08-02 20:39:45.000000000 -0400 +++ linux-2.4.24-vmcomm/include/asm-i386/siginfo.h 2004-07-17 16:50:14.000000000 -0400 @@ -183,6 +183,13 @@ #define NSIGPOLL 6 /* + * SIGVM flags, used to register and returned in siginfo + */ +#define VM_SWAPPING_OUT 0x1 +#define VM_SWAPPED_OUT 0x2 +#define VM_SWAPPED_IN 0x4 /* tested against mm->sigflag by notify_proc() in mm/memory.c before a swap-in signal is sent */ + +/* * sigevent definitions * * It seems likely that SIGEV_THREAD will have to be handled from diff -Nur linux-2.4.24/include/asm-i386/signal.h linux-2.4.24-vmcomm/include/asm-i386/signal.h --- linux-2.4.24/include/asm-i386/signal.h 2001-11-22 14:46:18.000000000 -0500 +++ linux-2.4.24-vmcomm/include/asm-i386/signal.h 2004-07-17 16:50:14.000000000 -0400 @@ -70,6 +70,8 @@ #define SIGRTMIN 32 #define SIGRTMAX (_NSIG-1) +#define SIGVM 35 /* new queueable vm signal; the number sits above SIGRTMIN (32), i.e. in the real-time signal range */ + /* * SA_FLAGS values: * diff -Nur linux-2.4.24/include/asm-i386/unistd.h linux-2.4.24-vmcomm/include/asm-i386/unistd.h --- linux-2.4.24/include/asm-i386/unistd.h 2002-11-28 18:53:15.000000000 -0500 +++ linux-2.4.24-vmcomm/include/asm-i386/unistd.h 2004-07-17 16:50:14.000000000 -0400 @@ -258,6 +258,11 @@ #define __NR_free_hugepages 251 #define __NR_exit_group 252 +/* the following are my new vm system calls; the numbers must match the sys_call_table slots added in arch/i386/kernel/entry.S */ +#define __NR_vm_register 259 +#define __NR_vm_relinquish 260 +#define __NR_vm_getswaprate 261 + /* user-visible error numbers are in the range -1 - -124: see */ #define __syscall_return(type, res) \ diff -Nur linux-2.4.24/include/asm-um/arch/rmap.h linux-2.4.24-vmcomm/include/asm-um/arch/rmap.h --- linux-2.4.24/include/asm-um/arch/rmap.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.4.24-vmcomm/include/asm-um/arch/rmap.h 2004-07-17 16:50:14.000000000 -0400 @@ -0,0 +1,9 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* LWRMAP: This entire file is a necessary addition.
*/ + +/* nothing to see, move along */ +#include + +#endif diff -Nur linux-2.4.24/include/asm-um/rmap.h linux-2.4.24-vmcomm/include/asm-um/rmap.h --- linux-2.4.24/include/asm-um/rmap.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.4.24-vmcomm/include/asm-um/rmap.h 2004-07-17 16:50:22.000000000 -0400 @@ -0,0 +1,6 @@ +#ifndef _UM_RMAP_H +#define _UM_RMAP_H + +#include + +#endif diff -Nur linux-2.4.24/include/linux/mm.h linux-2.4.24-vmcomm/include/linux/mm.h --- linux-2.4.24/include/linux/mm.h 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/include/linux/mm.h 2004-07-17 16:50:22.000000000 -0400 @@ -136,6 +136,14 @@ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); }; +/* LWRMAP: forward declaration; pte_chain is meant to be internal to + * rmap.c */ +/* struct pte_chain is no longer internal to rmap.c: mm/memory.c walks the chains too */ +struct pte_chain { + struct pte_chain * next; /* next element of the singly linked chain, NULL at the end */ + pte_t * ptep; /* page table entry recorded by this element */ +}; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -162,6 +170,10 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ + struct pte_chain * pte_chain; /* LWRMAP: Reverse pte mapping + * pointer. protected by + * PG_chainlock + */ struct page **pprev_hash; /* Complement to *next_hash. */ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ @@ -300,6 +312,7 @@ #define PG_reserved 14 #define PG_launder 15 /* written out by VM pressure..
*/ #define PG_fs_1 16 /* Filesystem specific */ +#define PG_chainlock 17 /* LWRMAP: lock bit for ->pte_chain */ #ifndef arch_set_page_uptodate #define arch_set_page_uptodate(page) @@ -329,6 +342,36 @@ #define ClearPageArch1(page) clear_bit(PG_arch_1, &(page)->flags) /* + * LWRMAP: inlines for acquisition of PG_chainlock + */ +static inline void pte_chain_lock(struct page *page) +{ + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. + */ +#ifdef CONFIG_SMP /* without CONFIG_SMP the function body is empty */ + while (test_and_set_bit(PG_chainlock, &page->flags)) { + while (test_bit(PG_chainlock, &page->flags)) + cpu_relax(); + } +#endif +} + +/* + * LWRMAP: inlines for release of PG_chainlock + */ +static inline void pte_chain_unlock(struct page *page) +{ +#ifdef CONFIG_SMP + clear_bit(PG_chainlock, &page->flags); /* NOTE(review): no explicit barrier precedes the release - confirm clear_bit() ordering is sufficient on every supported arch */ +#endif +} + +/* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic.
*/ @@ -527,7 +570,8 @@ extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); -extern void swapin_readahead(swp_entry_t); +/* extern void swapin_readahead(swp_entry_t); */ +extern void swapin_readahead(swp_entry_t, int *); /* added a parameter to store fault type in a vector allocated by caller */ extern struct address_space swapper_space; #define PageSwapCache(page) ((page)->mapping == &swapper_space) diff -Nur linux-2.4.24/include/linux/sched.h linux-2.4.24-vmcomm/include/linux/sched.h --- linux-2.4.24/include/linux/sched.h 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/include/linux/sched.h 2004-07-17 16:50:22.000000000 -0400 @@ -229,6 +229,9 @@ unsigned dumpable:1; + struct task_struct * sigproc; /* specifies the process interested in receiving the vm signals (currently limited to only one process per mm_struct); NOTE(review): plain pointer, no reference visible here - confirm it is cleared before the task is freed */ + unsigned long sigflag; /* specifies which types of vm signal the process wants to receive (VM_SWAPPING_OUT / VM_SWAPPED_OUT / VM_SWAPPED_IN bits) */ + /* Architecture-specific MM context */ mm_context_t context; }; diff -Nur linux-2.4.24/include/linux/swap.h linux-2.4.24-vmcomm/include/linux/swap.h --- linux-2.4.24/include/linux/swap.h 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/include/linux/swap.h 2004-07-17 16:50:22.000000000 -0400 @@ -1,6 +1,8 @@ #ifndef _LINUX_SWAP_H #define _LINUX_SWAP_H +#define ERIC_DEBUG /* my debug flag, undef it to make code faster; it also switches on DEBUG_RMAP in mm/rmap.c */ + #include #include @@ -68,6 +70,7 @@ struct dentry * swap_file; struct vfsmount *swap_vfsmnt; unsigned short * swap_map; + struct pte_chain ** pte_chain_map; /* reverse mapping for entries of this swap area: one chain head per swap offset, accessed under the swap device lock in mm/rmap.c */ unsigned int lowest_bit; unsigned int highest_bit; unsigned int cluster_next; @@ -102,6 +105,19 @@ struct zone_t; +/* LWRMAP: linux/mm/rmap.c */ +extern void FASTCALL(page_add_rmap(struct page *, pte_t *)); +extern void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +extern struct pte_chain ** FASTCALL(get_swap_pte_chain_map(swp_entry_t)); /* */ +extern void
FASTCALL(put_swap_pte_chain_map(swp_entry_t)); /* */ +extern void FASTCALL(swap_add_rmap(swp_entry_t, pte_t *)); /* */ +extern void FASTCALL(swap_remove_rmap(swp_entry_t, pte_t *)); /* */ + +/* linux/mm/vmcomm.c */ +extern atomic_t nr_recent_swap_out; /* */ +extern atomic_t nr_recent_swap_in; /* */ +//extern int send_vm_sig(struct task_struct *, void *, int); /* */ + /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(__lru_cache_del(struct page *)); @@ -133,7 +149,8 @@ extern void delete_from_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *page); extern struct page * lookup_swap_cache(swp_entry_t); -extern struct page * read_swap_cache_async(swp_entry_t); +/* extern struct page * read_swap_cache_async(swp_entry_t); */ +extern struct page * read_swap_cache_async(swp_entry_t, int *); /* added a parameter to return fault type */ /* linux/mm/oom_kill.c */ extern void out_of_memory(void); @@ -199,6 +216,14 @@ inc_nr_inactive_pages(page); \ } while (0) +/* macro used by vm_relinquish: links the page at the tail of inactive_list and accounts for it the same way the head-insert macro above does */ +#define add_page_to_inactive_list_tail(page) \ +do { \ + DEBUG_LRU_PAGE(page); \ + list_add(&(page)->lru, inactive_list.prev); \ + inc_nr_inactive_pages(page); \ +} while (0) + #define del_page_from_active_list(page) \ do { \ list_del(&(page)->lru); \ diff -Nur linux-2.4.24/include/linux/umlnewsyscalls.h linux-2.4.24-vmcomm/include/linux/umlnewsyscalls.h --- linux-2.4.24/include/linux/umlnewsyscalls.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.4.24-vmcomm/include/linux/umlnewsyscalls.h 2004-07-17 16:50:22.000000000 -0400 @@ -0,0 +1,12 @@ +/* to use under UML (for some odd reason), include this file in your source code when you use the new vm system calls */ +#ifndef __LINUX_UMLNEWSYSCALLS_H +#define __LINUX_UMLNEWSYSCALLS_H + +#include +#include + +_syscall1(long, vm_register, unsigned int, flag) +_syscall2(long, vm_relinquish, void **, pages, unsigned int, n) +_syscall3(long, vm_getswaprate, int, n, int *,
swap_out, int *, swap_in) + +#endif diff -Nur linux-2.4.24/include/linux/vmcomm.h linux-2.4.24-vmcomm/include/linux/vmcomm.h --- linux-2.4.24/include/linux/vmcomm.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.4.24-vmcomm/include/linux/vmcomm.h 2004-07-17 16:50:22.000000000 -0400 @@ -0,0 +1,21 @@ +/* _syscallN stubs for the three vm_* system calls added by this patch */ +#ifndef __LINUX_VMCOMM_H +#define __LINUX_VMCOMM_H + +//#include +#include +#include + +_syscall1(long, vm_register, unsigned int, flag) +_syscall2(long, vm_relinquish, void **, pages, unsigned int, n) +_syscall3(long, vm_getswaprate, int, n, int *, swap_out, int *, swap_in) + +/* +_syscall0(long, vm_donothing) +_syscall3(long, vm_copydata, void *, to, void *, from, unsigned, n) +_syscall2(long, vm_2, void *, a, void *, b) +_syscall1(long, vm_1, void *, a) +_syscall0(long, vm_0) +*/ + +#endif diff -Nur linux-2.4.24/init/main.c linux-2.4.24-vmcomm/init/main.c --- linux-2.4.24/init/main.c 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/init/main.c 2004-07-17 16:50:22.000000000 -0400 @@ -99,6 +99,8 @@ extern void sysctl_init(void); extern void signals_init(void); extern int init_pcmcia_ds(void); +extern void pte_chain_init(void); /* LWRMAP */ +extern void swap_hist_init(void); /* defined in vmcomm.c */ extern void free_initmem(void); @@ -402,6 +404,8 @@ mem_init(); kmem_cache_sizes_init(); pgtable_cache_init(); + pte_chain_init(); /* LWRMAP */ + swap_hist_init(); /* initializes swap rate history statistics */ /* * For architectures that have highmem, num_mappedpages represents diff -Nur linux-2.4.24/kernel/fork.c linux-2.4.24-vmcomm/kernel/fork.c --- linux-2.4.24/kernel/fork.c 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/kernel/fork.c 2004-07-17 16:50:22.000000000 -0400 @@ -235,6 +235,7 @@ mm->page_table_lock = SPIN_LOCK_UNLOCKED; mm->pgd = pgd_alloc(mm); mm->def_flags = 0; + mm->sigproc = NULL; /* by default no process receives the vm signals; mm->sigflag is not reset here */ if (mm->pgd) return mm; free_mm(mm); @@ -350,6 +351,12 @@ if (!mm_init(mm)) goto fail_nomem; +
/* set sigproc if that of parent is set */ + if (oldmm->sigproc) { + mm->sigproc = tsk; + } + /* end */ + if (init_new_context(tsk,mm)) goto free_pt; diff -Nur linux-2.4.24/Makefile linux-2.4.24-vmcomm/Makefile --- linux-2.4.24/Makefile 2004-01-05 08:53:56.000000000 -0500 +++ linux-2.4.24-vmcomm/Makefile 2004-07-17 16:52:49.000000000 -0400 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 24 -EXTRAVERSION = +EXTRAVERSION = -vmcomm KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nur linux-2.4.24/mm/filemap.c linux-2.4.24-vmcomm/mm/filemap.c --- linux-2.4.24/mm/filemap.c 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/filemap.c 2004-07-17 16:50:30.000000000 -0400 @@ -2743,15 +2743,56 @@ static unsigned char mincore_page(struct vm_area_struct * vma, unsigned long pgoff) { + /* this function examines only one page each time, not very efficient */ unsigned char present = 0; - struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; - struct page * page, ** hash = page_hash(as, pgoff); + if (vma->vm_file) { /* mincore works as before with non-anonymous pages */ + struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; + struct page * page, ** hash = page_hash(as, pgoff); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(as, pgoff, *hash); + if ((page) && (Page_Uptodate(page))) + present = 1; + spin_unlock(&pagecache_lock); + } + else { /* and differently with anonymous pages */ + spin_lock(&vma->vm_mm->page_table_lock); + + unsigned long address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* pgoff is presumably the index computed in the second hunk, which already includes vma->vm_pgoff; remove it to index pages from vm_start */ + +#ifdef ERIC_DEBUG + BUG_ON(address >= TASK_SIZE); + BUG_ON(vma->vm_start != (vma->vm_start & PAGE_MASK)); /* parentheses required: != binds tighter than & */ + BUG_ON(address != (address & PAGE_MASK)); +#endif + + pgd_t * dir = pgd_offset(vma->vm_mm, address); /* walk the mm whose page_table_lock is held above */ + BUG_ON(!dir); + + pmd_t * pmd = pmd_offset(dir, address); + BUG_ON(!pmd); + + if (pmd_present(*pmd)) { + pte_t * pte = pte_offset(pmd, address); + BUG_ON(!pte); - spin_lock(&pagecache_lock); - page =
__find_page_nolock(as, pgoff, *hash); - if ((page) && (Page_Uptodate(page))) - present = 1; - spin_unlock(&pagecache_lock); + if (pte_present(*pte)) { + present = 1; + } + else if (!pte_none(*pte)) { /* swapped out: resident only if an up-to-date copy sits in the swap cache */ + swp_entry_t entry = pte_to_swp_entry(*pte); + struct page * page, ** hash = page_hash(&swapper_space, entry.val); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(&swapper_space, entry.val, *hash); + if ((page) && (Page_Uptodate(page))) + present = 1; + spin_unlock(&pagecache_lock); + } + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + /* end */ return present; } @@ -2762,9 +2803,12 @@ long error, i, remaining; unsigned char * tmp; + /* enable mincore to work with anonymous pages error = -ENOMEM; if (!vma->vm_file) return error; + */ + /* no assertion on vm_pgoff for anonymous vmas: mincore_page() subtracts vm_pgoff again, so the offset arithmetic below can be shared */ start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) diff -Nur linux-2.4.24/mm/Makefile linux-2.4.24-vmcomm/mm/Makefile --- linux-2.4.24/mm/Makefile 2002-08-02 20:39:46.000000000 -0400 +++ linux-2.4.24-vmcomm/mm/Makefile 2004-07-17 16:50:30.000000000 -0400 @@ -11,10 +11,13 @@ export-objs := shmem.o filemap.o memory.o page_alloc.o +# LWRMAP: Added rmap.o module.
obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o rmap.o +# added vmcomm.o +obj-y += vmcomm.o obj-$(CONFIG_HIGHMEM) += highmem.o diff -Nur linux-2.4.24/mm/memory.c linux-2.4.24-vmcomm/mm/memory.c --- linux-2.4.24/mm/memory.c 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/memory.c 2004-07-17 16:50:30.000000000 -0400 @@ -47,6 +47,7 @@ #include #include +#include /* LWRMAP */ #include #include @@ -103,6 +104,7 @@ } pte = pte_offset(dir, 0); pmd_clear(dir); + pgtable_remove_rmap(pte); /* LWRMAP: reset the mapping/index back-pointers of the page table page before it is freed */ pte_free(pte); } @@ -237,12 +239,21 @@ if (pte_none(pte)) goto cont_copy_pte_range_noset; + /* LWRMAP: pte contains position in + * swap, so copy. */ if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; + /* LWRMAP: Used to goto + * cont_copy_pte_range. We + * perform the set_pte here to + * avoid page_add_rmap() now + * performed at that label. */ + set_pte(dst_pte, pte); + swap_add_rmap(pte_to_swp_entry(pte), dst_pte); /* duplicate the reverse mapping from the swap entry as well; must follow set_pte() because the debug checks in swap_add_rmap() inspect *dst_pte */ + goto cont_copy_pte_range_noset; } ptepage = pte_page(pte); - if ((!VALID_PAGE(ptepage)) || + if ((!VALID_PAGE(ptepage)) || PageReserved(ptepage)) goto cont_copy_pte_range; @@ -260,6 +271,8 @@ dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); + /* LWRMAP: register dst_pte as a further mapping of ptepage */ + page_add_rmap(ptepage, dst_pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out_unlock; @@ -315,11 +328,15 @@ continue; if (pte_present(pte)) { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && !PageReserved(page)) + if (VALID_PAGE(page) && !PageReserved(page)) { freed ++; + /* LWRMAP */ + page_remove_rmap(page, ptep); + } /* This will eventually call __free_pte on the pte.
*/ tlb_remove_page(tlb, ptep, address + offset); } else { + swap_remove_rmap(pte_to_swp_entry(pte), ptep); /* remove the reverse mapping from the swap entry as well */ free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(ptep); } @@ -983,7 +1000,12 @@ if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; + /* I didn't modify here because no swap entry is involved */ + /* LWRMAP */ + page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + /* LWRMAP */ + page_add_rmap(new_page, page_table); lru_cache_add(new_page); /* Free the old page.. */ @@ -1090,7 +1112,8 @@ * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... */ -void swapin_readahead(swp_entry_t entry) +/* void swapin_readahead(swp_entry_t entry) */ +void swapin_readahead(swp_entry_t entry, int * retvec) /* added a parameter to store fault type in a vector allocated by caller: retvec[0] = first offset, retvec[1] = number of entries, retvec[2..] = fault type of each page read */ { int i, num; struct page *new_page; @@ -1100,16 +1123,147 @@ * Get the number of handles we should do readahead io to.
*/ num = valid_swaphandles(entry, &offset); + + /* first store offset and number of entries; retvec may be NULL, in which case nothing is recorded */ + int * p = retvec; + if (p) { + *(p++) = offset; + *(p++) = num; + } + new_page = NULL; + /* end */ + for (i = 0; i < num; offset++, i++) { /* Ok, do the async read-ahead now */ - new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); + /* new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); */ + + /* then store fault type of each page read */ + if (p) { + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), p); + p++; + } + else { + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), NULL); + } +#ifdef ERIC_DEBUG + if (new_page) { + BUG_ON(p && !*(p - 1)); /* the fault type should be set */ + } + else { + BUG_ON(p && *(p - 1)); /* the fault type should be 0 */ + BUG(); /* temporarily */ + } +#endif + /* end */ + if (!new_page) break; page_cache_release(new_page); } + + /* start */ +#ifdef ERIC_DEBUG + BUG_ON(p && (unsigned long)p != (unsigned long)retvec + 8 + 4 * num); +#endif + if (retvec && !new_page) { /* trim the recorded count to the pages actually read; skipped when the caller passed no vector */ + retvec[1] = i; + } + /* end */ + return; } +/* notify the process(es) (through reverse mapping) of the swapping events */ +void notify_proc(swp_entry_t orig_entry, int * infovec, int info) +{ + swp_entry_t swap; + unsigned long offset; + int num, i, fault_type; + + if (infovec) { + offset = infovec[0]; + num = infovec[1]; + +#ifdef ERIC_DEBUG + BUG_ON(num > (1 << page_cluster) || num < 0); +#endif + + for (i = 0; i < num; i++, offset++) { + fault_type = infovec[i + 2]; + +#ifdef ERIC_DEBUG + BUG_ON(fault_type != 1 && fault_type != 2); /* otherwise we shouldn't have reached here through num */ +#endif + + if (fault_type != 2) { /* no need to notify process(es) if the fault is minor */ + continue; + } + swap = SWP_ENTRY(SWP_TYPE(orig_entry), offset); + + /* notify interested process(es) that the page is swapped in */ + struct pte_chain ** pcp; + pcp = get_swap_pte_chain_map(swap); /* lock sdev_lock here */ +
+#ifdef ERIC_DEBUG + BUG_ON(!pcp); +#endif + + struct pte_chain * pc = *pcp; + pte_t * ptep; + struct mm_struct * mm; + unsigned long address; + while (pc != NULL) { /* scan through all reverse mappings of the swap entry */ + ptep = pc->ptep; + mm = ptep_to_mm(ptep); + address = ptep_to_address(ptep); /* virtual address rebuilt from the position of the pte, handed to send_vm_sig() below */ + if (mm->sigproc && (mm->sigflag & VM_SWAPPED_IN)) { /* send VM signal to interested process that the page is swapped in */ + send_vm_sig(mm->sigproc, (void *)address, VM_SWAPPED_IN); + } + pc = pc->next; + } + + put_swap_pte_chain_map(swap); /* unlock sdev_lock here */ + /* end */ + } + } + + if (info == 2) { /* this can only happen when the readahead failed somehow if readahead was performed, or when the swap page is try_to_unuse'd */ + +// offset = infovec[0]; +//#ifdef ERIC_DEBUG +// int distance = SWP_OFFSET(orig_entry) - offset; +// BUG_ON(distance < 0 || distance >= (1 << page_cluster)); +// //BUG_ON(distance < num && infovec[distance + 2] != 0); /* this is sometimes triggered, don't know why */ +//#endif + + + /* notify interested process(es) that the page is swapped in */ + struct pte_chain ** pcp; + pcp = get_swap_pte_chain_map(orig_entry); /* lock sdev_lock here; it stays held across the send_vm_sig() calls in the loop below */ + +#ifdef ERIC_DEBUG + BUG_ON(!pcp); +#endif + + struct pte_chain * pc = *pcp; + pte_t * ptep; + struct mm_struct * mm; + unsigned long address; + while (pc != NULL) { /* scan through all reverse mappings of the swap entry */ + ptep = pc->ptep; + mm = ptep_to_mm(ptep); + address = ptep_to_address(ptep); + if (mm->sigproc && (mm->sigflag & VM_SWAPPED_IN)) { /* send VM signal to interested process that the page is swapped in */ + send_vm_sig(mm->sigproc, (void *)address, VM_SWAPPED_IN); + } + pc = pc->next; + } + + put_swap_pte_chain_map(orig_entry); /* unlock sdev_lock here */ + /* end */ + } +} + /* * We hold the mm semaphore and the page_table_lock on entry and * should release the pagetable lock on exit..
@@ -1123,11 +1277,18 @@ pte_t pte; int ret = 1; + /* storage for fault type */ + int infovecsize = (1 << page_cluster) + 2; + int infovec[infovecsize]; /* vector to store fault types of prefetched entries: [0] = first offset, [1] = entry count, [2..] = one fault type per readahead page */ + int info = 0; /* yet another int to store fault type of the exact page since it's read twice */ + /* end */ + spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { - swapin_readahead(entry); - page = read_swap_cache_async(entry); + memset(infovec, 0, infovecsize * sizeof(int)); /* clear the whole vector first: memset counts bytes, not elements */ + swapin_readahead(entry, infovec); /* store fault types of the readahead pages */ + page = read_swap_cache_async(entry, &info); /* store fault type of the exact page */ if (!page) { /* * Back out if somebody else faulted in this pte while @@ -1137,17 +1298,35 @@ spin_lock(&mm->page_table_lock); retval = pte_same(*page_table, orig_pte) ? -1 : 1; spin_unlock(&mm->page_table_lock); + + /* info must be 0 when page is NULL */ +#ifdef ERIC_DEBUG + BUG_ON(info); +#endif + notify_proc(entry, infovec, info); + /* end */ + return retval; } /* Had to read the page from swap area: Major fault */ ret = 2; + +#ifdef ERIC_DEBUG + BUG_ON(info != 1 && info != 2); /* info can't be 0 since page is not NULL */ +#endif } mark_page_accessed(page); lock_page(page); + /* here is probably the best place to notify processes (instead of right after swapin_readahead) since all page io operations are likely to have finished now */ + if (ret == 2) { /* notify the processes only if the page actually caused a major fault, otherwise no readahead happened */ + notify_proc(entry, infovec, info); + } + /* end */ + /* * Back out if somebody else faulted in this pte while we * released the page table lock. @@ -1162,6 +1341,7 @@ /* The page isn't present yet, go ahead with the fault.
*/ + swap_remove_rmap(entry, page_table); /* hand over the reverse mapping to the page descriptor since the page is swapped in */ swap_free(entry); if (vm_swap_full()) remove_exclusive_swap_page(page); @@ -1175,6 +1355,8 @@ flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + /* LWRMAP: the pte now maps the page, so record it in the reverse-mapping chain of the page */ + page_add_rmap(page, page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); @@ -1190,13 +1372,18 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { pte_t entry; + /* LWRMAP: Declare and define here. See removed declaration + * of page below, in conditional block. */ + struct page* page = ZERO_PAGE(addr); /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { - struct page *page; + /* LWRMAP: Moved above to support page_add_rmap() call + * after this block. */ + /* struct page *page; */ /* Allocate our own private page. */ spin_unlock(&mm->page_table_lock); @@ -1220,6 +1407,9 @@ } set_pte(page_table, entry); + /* LWRMAP */ + page_add_rmap(page, page_table); /* ignores ZERO_PAGE */ + /* NOTE(review): "ignores ZERO_PAGE" presumably means page_add_rmap() returns early for reserved pages such as ZERO_PAGE - confirm in mm/rmap.c */ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); @@ -1295,6 +1485,8 @@ if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); + /* LWRMAP */ + page_add_rmap(new_page, page_table); } else { /* One of our sibling threads was faster, back out.
*/ page_cache_release(new_page); @@ -1454,6 +1646,8 @@ goto out; } } + /* LWRMAP: record owner mm and base address on the new page table page before it becomes visible */ + pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } out: diff -Nur linux-2.4.24/mm/mremap.c linux-2.4.24-vmcomm/mm/mremap.c --- linux-2.4.24/mm/mremap.c 2004-01-05 08:53:56.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/mremap.c 2004-07-17 16:50:30.000000000 -0400 @@ -61,8 +61,19 @@ { int error = 0; pte_t pte; + /* LWRMAP */ + struct page* page = NULL; + + /* LWRMAP */ + if (pte_present(*src)) + page = pte_page(*src); if (!pte_none(*src)) { + /* LWRMAP: detach the reverse mapping from the old pte location before the pte is cleared */ + if (page) + page_remove_rmap(page, src); + else /* non-present but non-empty pte: treated as a swap entry, which is tracked in the swap reverse map */ + swap_remove_rmap(pte_to_swp_entry(*src), src); pte = ptep_get_and_clear(src); if (!dst) { /* No dest? We must put it back. */ @@ -70,6 +81,10 @@ error++; } set_pte(dst, pte); + if (page) + page_add_rmap(page, dst); + else /* re-register the swap entry against the new pte location; done after set_pte() because swap_add_rmap() checks *dst in debug builds */ + swap_add_rmap(pte_to_swp_entry(pte), dst); } return error; } diff -Nur linux-2.4.24/mm/page_alloc.c linux-2.4.24-vmcomm/mm/page_alloc.c --- linux-2.4.24/mm/page_alloc.c 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/page_alloc.c 2004-07-17 16:50:30.000000000 -0400 @@ -109,6 +109,9 @@ BUG(); if (PageActive(page)) BUG(); + /* LWRMAP: a page being freed must not have any reverse mappings left */ + if (page->pte_chain) + BUG(); ClearPageReferenced(page); ClearPageDirty(page); diff -Nur linux-2.4.24/mm/rmap.c linux-2.4.24-vmcomm/mm/rmap.c --- linux-2.4.24/mm/rmap.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/rmap.c 2004-07-17 16:50:30.000000000 -0400 @@ -0,0 +1,407 @@ +/* reverse mapping from swap slot entry to pte is also added */ + +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * + * Simple, low overhead pte-based reverse mapping scheme. + * This is kept modular because we may want to experiment + * with object-based reverse mapping schemes. Please try + * to keep this thing as modular as possible. + */ + +/* LWRMAP: This entire file is necessary, but parts of it have been + * removed from the source rmap patch.
*/ + +/* + * Locking: + * - the page->pte_chain is protected by the PG_chainlock bit, + * which nests within the lru lock, then the + * mm->page_table_lock, and then the page lock. + * - because swapout locking is opposite to the locking order + * in the page fault path, the swapout path uses trylocks + * on the mm->page_table_lock + */ +#include +#include +#include +#include +#include + +#include +#include +#include + +/* #define DEBUG_RMAP */ +#ifdef ERIC_DEBUG /* ERIC_DEBUG is defined in include/linux/swap.h by this patch */ +#define DEBUG_RMAP +#endif + +/* + * Shared pages have a chain of pte_chain structures, used to locate + * all the mappings to this page. We only need a pointer to the pte + * here, the page struct for the page table page contains the process + * it belongs to and the offset within that process. + * + * A singly linked list should be fine for most, if not all, workloads. + * On fork-after-exec the mapping we'll be removing will still be near + * the start of the list, on mixed application systems the short-lived + * processes will have their mappings near the start of the list and + * in systems with long-lived applications the relative overhead of + * exit() will be lower since the applications are long-lived.
+ */ +/* struct pte_chain is moved to mm.h and no longer internal to rmap.c +struct pte_chain { + struct pte_chain * next; + pte_t * ptep; +}; +*/ + +static kmem_cache_t * pte_chain_cache; +static inline struct pte_chain * pte_chain_alloc(void); +/* static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, + struct page *); */ +static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *, struct pte_chain **); /* added a parameter; swap_remove_rmap() passes a NULL page and the address of the swap chain head - presumably the head to fix up, confirm in pte_chain_free() */ + +/* returns the address of the chain head for this swap entry with the swap device lock held; release it with put_swap_pte_chain_map(); lock is not held if error occurs */ +struct pte_chain ** get_swap_pte_chain_map(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + type = SWP_TYPE(entry); +#ifdef ERIC_DEBUG + if (type >= nr_swapfiles) { + printk(KERN_ERR "get_swap_pte_chain_map: bad swap file entry %08lx\n", entry.val); + BUG(); + return NULL; + } +#endif + offset = SWP_OFFSET(entry); + p = type + swap_info; + swap_device_lock(p); /* sdev_lock protects the pte chain field */ +#ifdef ERIC_DEBUG + if (offset >= p->max || !p->swap_map[offset]) { + printk(KERN_ERR "get_swap_pte_chain_map: bad swap offset entry %08lx\n", entry.val); + swap_device_unlock(p); + BUG(); + return NULL; + } +#endif + return &p->pte_chain_map[offset]; +} + +/* drops the swap device lock taken by get_swap_pte_chain_map() for the same entry */ +void put_swap_pte_chain_map(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + type = SWP_TYPE(entry); +#ifdef ERIC_DEBUG + if (type >= nr_swapfiles) { + printk(KERN_ERR "put_swap_pte_chain_map: bad swap file entry %08lx\n", entry.val); + BUG(); + return; + } +#endif + offset = SWP_OFFSET(entry); + p = type + swap_info; +#ifdef ERIC_DEBUG + if (offset >= p->max || !p->swap_map[offset]) { + printk(KERN_ERR "put_swap_pte_chain_map: bad swap offset entry %08lx\n", entry.val); + BUG(); + return; + } +#endif + swap_device_unlock(p); +} + +/* swap_add_rmap adds a reverse mapping entry to the pte from the swap entry + * uses swap_info_struct->sdev_lock (may not be ideal) + * the caller needs to hold mm->page_table_lock + */
+void swap_add_rmap(swp_entry_t entry, pte_t * ptep) +{ + struct pte_chain * pte_chain; + +#ifdef ERIC_DEBUG + if (!ptep) + BUG(); + if (pte_present(*ptep)) + BUG(); + if (pte_none(*ptep)) + BUG(); + if (!ptep_to_mm(ptep)) + BUG(); +#endif + + struct swap_info_struct * p; + unsigned long offset, type; + type = SWP_TYPE(entry); +#ifdef ERIC_DEBUG + if (type >= nr_swapfiles) { + printk(KERN_ERR "swap_add_rmap: bad swap file entry %08lx\n", entry.val); + BUG(); + return; + } +#endif + offset = SWP_OFFSET(entry); + p = type + swap_info; + swap_device_lock(p); /* sdev_lock protects the pte chain field */ +#ifdef ERIC_DEBUG + if (offset >= p->max || !p->swap_map[offset]) { + printk(KERN_ERR "swap_add_rmap: bad swap offset entry %08lx\n", entry.val); + swap_device_unlock(p); + BUG(); + return; + } +#endif + +#ifdef ERIC_DEBUG + struct pte_chain * pc; + for (pc = p->pte_chain_map[offset]; pc; pc = pc->next) { + if (pc->ptep == ptep) + BUG(); + } +#endif + + pte_chain = pte_chain_alloc(); + + /* Hook up the pte_chain to the page. 
*/ + pte_chain->ptep = ptep; + pte_chain->next = p->pte_chain_map[offset]; + p->pte_chain_map[offset] = pte_chain; + + swap_device_unlock(p); +} + +/* swap_remove_rmap removes a reverse mapping entry to the pte from the swap entry + * uses swap_info_struct->sdev_lock (may not be ideal) + * the caller needs to hold mm->page_table_lock + */ +void swap_remove_rmap(swp_entry_t entry, pte_t * ptep) +{ + struct pte_chain * pc, * prev_pc = NULL; + +#ifdef ERIC_DEBUG + if (!ptep) + BUG(); + if (pte_present(*ptep)) + BUG(); + if (pte_none(*ptep)) + BUG(); +#endif + + struct swap_info_struct * p; + unsigned long offset, type; + type = SWP_TYPE(entry); +#ifdef ERIC_DEBUG + if (type >= nr_swapfiles) { + printk(KERN_ERR "swap_add_rmap: bad swap file entry %08lx\n", entry.val); + BUG(); + return; + } +#endif + offset = SWP_OFFSET(entry); + p = type + swap_info; + swap_device_lock(p); /* sdev_lock protects the pte chain field */ +#ifdef ERIC_DEBUG + if (offset >= p->max || !p->swap_map[offset]) { + printk(KERN_ERR "swap_add_rmap: bad swap offset entry %08lx\n", entry.val); + swap_device_unlock(p); + BUG(); + return; + } +#endif + + for (pc = p->pte_chain_map[offset]; pc; prev_pc = pc, pc = pc->next) { + if (pc->ptep == ptep) { + pte_chain_free(pc, prev_pc, NULL, &p->pte_chain_map[offset]); + goto out; + } + } +#ifdef ERIC_DEBUG + /* Not found. This should NEVER happen! */ + printk(KERN_ERR "swap_remove_rmap: pte_chain %p not present.\n", ptep); + printk(KERN_ERR "swap_remove_rmap: only found: "); + for (pc = p->pte_chain_map[offset]; pc; pc = pc->next) + printk("%p ", pc->ptep); + printk("\n"); + printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n"); + BUG(); +#endif + +out: + swap_device_unlock(p); + return; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. 
+ */ +void page_add_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pte_chain; + + +#ifdef DEBUG_RMAP + if (!page || !ptep) + BUG(); + if (!pte_present(*ptep)) + BUG(); + if (!ptep_to_mm(ptep)) + BUG(); +#endif + + if (!VALID_PAGE(page) || PageReserved(page)) + return; + +#ifdef DEBUG_RMAP + pte_chain_lock(page); + { + struct pte_chain * pc; + for (pc = page->pte_chain; pc; pc = pc->next) { + if (pc->ptep == ptep) + BUG(); + } + } + pte_chain_unlock(page); +#endif + + pte_chain = pte_chain_alloc(); + + pte_chain_lock(page); + + /* Hook up the pte_chain to the page. */ + pte_chain->ptep = ptep; + pte_chain->next = page->pte_chain; + page->pte_chain = pte_chain; + + pte_chain_unlock(page); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * @ptep: page table entry to remove + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. + * Caller needs to hold the mm->page_table_lock. + */ +void page_remove_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pc, * prev_pc = NULL; + +#ifdef DEBUG_RMAP /* */ + if (!page || !ptep) + BUG(); +#endif + + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + pte_chain_lock(page); + for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) { + if (pc->ptep == ptep) { + pte_chain_free(pc, prev_pc, page, NULL); + goto out; + } + } +#ifdef DEBUG_RMAP + /* Not found. This should NEVER happen! */ + printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep); + printk(KERN_ERR "page_remove_rmap: only found: "); + for (pc = page->pte_chain; pc; pc = pc->next) + printk("%p ", pc->ptep); + printk("\n"); + printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n"); +#endif + +out: + pte_chain_unlock(page); + return; + +} + +/** + ** No more VM stuff below this comment, only pte_chain helper + ** functions. 
+ **/ + +/** + * pte_chain_free - free pte_chain structure + * @pte_chain: pte_chain struct to free + * @prev_pte_chain: previous pte_chain on the list (may be NULL) + * @page: page this pte_chain hangs off (may be NULL) + * + * This function unlinks pte_chain from the singly linked list it + * may be on and adds the pte_chain to the free list. May also be + * called for new pte_chain structures which aren't on any list yet. + * Caller needs to hold the pte_chain_lock if the page is non-NULL. + */ +/* static inline void pte_chain_free(struct pte_chain * pte_chain, + struct pte_chain * prev_pte_chain, struct page * page) */ +static inline void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page, struct pte_chain ** swap_pc) /* added swap_pc */ +{ + if (prev_pte_chain) + prev_pte_chain->next = pte_chain->next; + else if (page) + page->pte_chain = pte_chain->next; + else if (swap_pc) + *swap_pc = pte_chain->next; + + kmem_cache_free(pte_chain_cache, pte_chain); +} + +/** + * pte_chain_alloc - allocate a pte_chain struct + * + * Returns a pointer to a fresh pte_chain structure. Allocates new + * pte_chain structures as required. + * Caller needs to hold the page's pte_chain_lock. + */ +static inline struct pte_chain * pte_chain_alloc(void) +{ + struct pte_chain * pte_chain; + + pte_chain = kmem_cache_alloc(pte_chain_cache, GFP_ATOMIC); + + /* I don't think anybody managed to trigger this one -- Rik */ + if (unlikely(pte_chain == NULL)) + panic("fix pte_chain OOM handling\n"); + + return pte_chain; +} + +void __init pte_chain_init(void) +{ + pte_chain_cache = kmem_cache_create( "pte_chain", + sizeof(struct pte_chain), + 0, + 0, + NULL, + NULL); + + if (!pte_chain_cache) + panic("failed to create pte_chain cache!\n"); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. 
This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nur linux-2.4.24/mm/shmem.c linux-2.4.24-vmcomm/mm/shmem.c --- linux-2.4.24/mm/shmem.c 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/shmem.c 2004-07-17 16:50:30.000000000 -0400 @@ -656,8 +656,8 @@ swappage = lookup_swap_cache(swap); if (!swappage) { spin_unlock(&info->lock); - swapin_readahead(swap); - swappage = read_swap_cache_async(swap); + swapin_readahead(swap, NULL); /* added NULL to be compatible with changed function interface */ + swappage = read_swap_cache_async(swap, NULL); /* added NULL to be compatible with changed function interface */ if (!swappage) { spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); diff -Nur linux-2.4.24/mm/swapfile.c linux-2.4.24-vmcomm/mm/swapfile.c --- linux-2.4.24/mm/swapfile.c 2003-08-25 07:44:44.000000000 -0400 +++ linux-2.4.24-vmcomm/mm/swapfile.c 2004-07-17 16:50:30.000000000 -0400 @@ -17,6 +17,10 @@ #include +#include /* */ +extern struct timer_list swap_hist_timer; /* defined in vmcomm.c */ +extern void update_swap_hist(unsigned long); /* defined in vmcomm.c */ + spinlock_t swaplock = SPIN_LOCK_UNLOCKED; unsigned int nr_swapfiles; int total_swap_pages; @@ -371,10 +375,14 @@ return; if (unlikely(pte_none(pte) || pte_present(pte))) return; + swap_remove_rmap(entry, dir); /* hand over the reverse mapping to the page descriptor */ get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + /* LWRMAP */ + page_add_rmap(page, dir); swap_free(entry); ++vma->vm_mm->rss; + /* should we also notify interested process(es) here that the page is swapped in? 
*/ } /* mmlist_lock and vma->vm_mm->page_table_lock are held */ @@ -522,6 +530,7 @@ int retval = 0; int reset_overflow = 0; int shmem; + int faulttype = 0; /* indicates whether the page is read from disk or found in cache */ /* * When searching mms for an entry, a good strategy is to @@ -561,7 +570,8 @@ */ swap_map = &si->swap_map[i]; entry = SWP_ENTRY(type, i); - page = read_swap_cache_async(entry); + /* page = read_swap_cache_async(entry); */ + page = read_swap_cache_async(entry, &faulttype); if (!page) { /* * Either swap_duplicate() failed because entry @@ -595,6 +605,10 @@ wait_on_page(page); lock_page(page); + if (faulttype == 2) { /* notify the process(es) if the page was read from swap disk */ + notify_proc(entry, NULL, faulttype); + } + /* * Remove all references to entry, without blocking. * Whenever we reach init_mm, there's no address space @@ -793,13 +807,25 @@ p->swap_vfsmnt = NULL; p->swap_file = NULL; p->swap_device = 0; +#ifdef ERIC_DEBUG + for (i = 0; i < p->max; i++) { /* the pte_chain_map entries should have been cleared by now */ + BUG_ON(p->pte_chain_map[i]); + } +#endif p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; + struct pte_chain ** pte_chain_map = p->pte_chain_map; /* store the pte_chain_map field temporarily in order to free it later */ + p->pte_chain_map = NULL; /* */ p->flags = 0; swap_device_unlock(p); swap_list_unlock(); +#ifdef ERIC_DEBUG + BUG_ON(!swap_map); /* */ + BUG_ON(!pte_chain_map); /* */ +#endif vfree(swap_map); + vfree(pte_chain_map); /* free the reverse mapping array (individual entries should have been freed) */ err = 0; out_dput: @@ -902,6 +928,7 @@ p->swap_vfsmnt = NULL; p->swap_device = 0; p->swap_map = NULL; + p->pte_chain_map = NULL; /* */ p->lowest_bit = 0; p->highest_bit = 0; p->cluster_nr = 0; @@ -1006,7 +1033,8 @@ } nr_good_pages = j; p->swap_map = vmalloc(maxpages * sizeof(short)); - if (!p->swap_map) { + p->pte_chain_map = vmalloc(maxpages * sizeof(struct pte_chain *)); /* allocate the reverse 
mapping array */ + if (!p->swap_map || !p->pte_chain_map) { /* */ error = -ENOMEM; goto bad_swap; } @@ -1016,6 +1044,7 @@ else p->swap_map[i] = SWAP_MAP_BAD; } + memset(p->pte_chain_map, 0, maxpages * sizeof(struct pte_chain *)); /* initiate the reverse mapping array */ break; case 2: @@ -1040,13 +1069,14 @@ goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + if (!(p->swap_map = vmalloc(maxpages * sizeof(short))) || !(p->pte_chain_map = vmalloc(maxpages * sizeof(struct pte_chain *)))) { /* allocate the reverse mapping array */ error = -ENOMEM; goto bad_swap; } error = 0; memset(p->swap_map, 0, maxpages * sizeof(short)); + memset(p->pte_chain_map, 0, maxpages * sizeof(struct pte_chain *)); /* initiate the reverse mapping array */ for (i=0; iinfo.nr_badpages; i++) { int page = swap_header->info.badpages[i]; if (page <= 0 || page >= swap_header->info.last_page) @@ -1113,16 +1143,30 @@ p->swap_file = NULL; p->swap_vfsmnt = NULL; p->swap_map = NULL; + struct pte_chain ** pte_chain_map = p->pte_chain_map; /* store the pte_chain_map field temporarily in order to free it later */ + p->pte_chain_map = NULL; /* */ p->flags = 0; if (!(swap_flags & SWAP_FLAG_PREFER)) ++least_priority; swap_list_unlock(); if (swap_map) vfree(swap_map); + if (pte_chain_map) /* free the reverse mapping array */ + vfree(pte_chain_map); path_release(&nd); out: if (swap_header) free_page((long) swap_header); + + /* turn on timer to update swapping statistics periodically - every 1 second currently */ + if (!error) { + init_timer(&swap_hist_timer); + swap_hist_timer.function = update_swap_hist; + swap_hist_timer.expires = jiffies + 1 * HZ; + add_timer(&swap_hist_timer); + } + /* end */ + unlock_kernel(); return error; } diff -Nur linux-2.4.24/mm/swap_state.c linux-2.4.24-vmcomm/mm/swap_state.c --- linux-2.4.24/mm/swap_state.c 2002-11-28 18:53:15.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/swap_state.c 2004-07-17 
16:50:30.000000000 -0400 @@ -17,6 +17,8 @@ #include +extern atomic_t nr_recent_swap_in; /* */ + /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. @@ -181,7 +183,8 @@ * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ -struct page * read_swap_cache_async(swp_entry_t entry) +/* struct page * read_swap_cache_async(swp_entry_t entry) */ +struct page * read_swap_cache_async(swp_entry_t entry, int * retp) /* added a parameter to return fault type */ { struct page *found_page, *new_page = NULL; int err; @@ -221,11 +224,23 @@ * Initiate read into locked page and return. */ rw_swap_page(READ, new_page); + atomic_inc(&nr_recent_swap_in); /* */ + if (retp) { /* set return value */ + *retp = 2; /* page is read from swap entry, major fault */ + } return new_page; } } while (err != -ENOENT); if (new_page) page_cache_release(new_page); + + if (retp) { /* set return value */ + if (found_page) /* page is found in swap cache, minor fault */ + *retp = 1; + else + *retp = 0; /* out of memory error */ + } + return found_page; } diff -Nur linux-2.4.24/mm/vmcomm.c linux-2.4.24-vmcomm/mm/vmcomm.c --- linux-2.4.24/mm/vmcomm.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/vmcomm.c 2004-07-17 16:50:30.000000000 -0400 @@ -0,0 +1,284 @@ +/* this file implements VM communication related routines */ +#include +#include +#include + +#define SWAP_HIST (5) + +atomic_t nr_recent_swap_out; +atomic_t nr_recent_swap_in; +int swap_hist_index; +int swap_out_hist[SWAP_HIST]; +int swap_in_hist[SWAP_HIST]; +struct timer_list swap_hist_timer; + +extern int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone); + + +void __init swap_hist_init(void) +{ + int i; + atomic_set(&nr_recent_swap_out, 0); + atomic_set(&nr_recent_swap_in, 0); + swap_hist_index = 0; + for (i = 0; i < 
SWAP_HIST; i++) { + swap_out_hist[i] = 0; + swap_in_hist[i] = 0; + } +} + + +void update_swap_hist(unsigned long unused) +{ + int out, in; + BUG_ON(swap_hist_index < 0 || swap_hist_index >= SWAP_HIST); + swap_hist_index++; + if (swap_hist_index == SWAP_HIST) { + swap_hist_index = 0; + } + BUG_ON(swap_hist_index < 0 || swap_hist_index >= SWAP_HIST); +/* + xchg(&nr_recent_swap_out, 0); + xchg(&nr_recent_swap_in, 0); +*/ + /* possible loss of count between read and set, should be better if we use an atomic swap operation */ + out = atomic_read(&nr_recent_swap_out); + atomic_set(&nr_recent_swap_out, 0); + in = atomic_read(&nr_recent_swap_in); + atomic_set(&nr_recent_swap_in, 0); + + swap_out_hist[swap_hist_index] = out; + swap_in_hist[swap_hist_index] = in; + del_timer(&swap_hist_timer); + swap_hist_timer.expires = jiffies + 1 * HZ; + add_timer(&swap_hist_timer); +} + + +int send_vm_sig(struct task_struct * t, void * addr, int flag) +{ + siginfo_t info; + +#ifdef ERIC_DEBUG + BUG_ON(!t); + BUG_ON(!addr || (unsigned long)addr & ~PAGE_MASK); + BUG_ON(flag != VM_SWAPPING_OUT && flag != VM_SWAPPED_OUT && flag != VM_SWAPPED_IN); +#endif + +#ifdef ERIC_DEBUG + static int a = 0, b = 0, c = 0; + switch (flag) { + case VM_SWAPPING_OUT: + info.si_uid = ++a; + break; + case VM_SWAPPED_OUT: + info.si_uid = ++b; + break; + case VM_SWAPPED_IN: + info.si_uid = ++c; + break; + default: + BUG(); + } +#endif + + info.si_signo = SIGVM; + info.si_errno = flag; + info.si_code = SI_KERNEL; + info.si_addr = addr; + send_sig_info(SIGVM, &info, t); + return 0; +} + + +/* flag 0x1 for VM_SWAPPING_OUT, 0x2 for VM_SWAPPED_OUT, 0x4 for VM_SWAPPED_IN */ +asmlinkage long sys_vm_register(unsigned long flag) +{ + long error = -EINVAL; + struct mm_struct * mm = current->mm; + + if (flag < 0 || flag > (VM_SWAPPING_OUT | VM_SWAPPED_OUT | VM_SWAPPED_IN)) { + return errno; + } + if (flag == 0) { /* unregister */ + if (mm->sigproc == current) { + mm->sigproc = NULL; + return 0; + } + else if (mm->sigproc == 
NULL) { + error = -ENOENT; + return error; + } + else { + BUG(); + mm->sigproc = NULL; + error = -EPERM; + return error; + } + } + else { /* register */ + if (mm->sigproc == NULL) { + mm->sigproc = current; + mm->sigflag = flag; + return 0; + } + else { + error = -EEXIST; + return error; + } + } +} + + +/* current->mm->mmap_sem is held with read bias */ +static long vm_relinquish_page(void ** pages, size_t n) +{ + long error; + struct mm_struct * mm = current->mm; + long i; + int page_discardable; + for (i = 0; i < n; i++) { + unsigned long page_addr = (unsigned long)pages[i]; /* get a page address */ + + page_discardable = 0; + if (page_addr & 1UL) { /* this is a discardable page */ + page_addr &= ~1UL; + page_discardable = 1; + } + + error = -EINVAL; + if (page_addr & ~PAGE_MASK) /* address not page-aligned */ + return error; + + if (page_discardable) { /* discardable page, release the resource it holds */ + zap_page_range(mm, page_addr, PAGE_SIZE); /* let zap_page_range() handle it */ + } + else { /* non-discardable page, push it to the end of the LRU queue */ + struct vm_area_struct * vma = find_vma(mm, page_addr); + error = -ENOMEM; + if (!vma) /* this page does not exist in any memory region */ + return error; + + error = -EPERM; + if (vma->vm_flags & VM_LOCKED) + return error; /* this page can not be released */ + + spin_lock(&mm->page_table_lock); + + pgd_t * dir = pgd_offset(mm, page_addr); + BUG_ON(!dir); + + pmd_t * pmd = pmd_offset(dir, page_addr); + BUG_ON(!pmd); + + if (pmd_present(*pmd)) { + pte_t * pte = pte_offset(pmd, page_addr); + BUG_ON(!pte); + + if (pte_present(*pte)) { + struct page * page = pte_page(*pte); + + if (VALID_PAGE(page)) { + if (!PageReserved(page) && PageLRU(page)) { + ptep_test_and_clear_young(pte); /* we just want to clear its reference bit */ + + /* push this page down to the end of the LRU queue */ + spin_lock(&pagemap_lru_lock); + + /* remove this page from where it was in the LRU queue */ + if (PageActive(page)) + 
del_page_from_active_list(page); + else + del_page_from_inactive_list(page); + + /* put it at the end of inactive_list, so it'll be swapped out soon */ + add_page_to_inactive_list_tail(page); + + spin_unlock(&pagemap_lru_lock); + + /* let try_to_swap_out() do the rest of the job */ + /* we prepared the page status carefully so try_to_swap_out() has no reason not to swap it out */ + try_to_swap_out(mm, vma, page_addr, pte, page, page_zone(page)); + } + /* otherwise this page is reserved and we ignore it */ + } + else { /* not a valid page */ + error = -EINVAL; + spin_unlock(&mm->page_table_lock); + return error; + } + } + /* otherwise this page is not present, we don't have to release it */ + } + /* otherwise the pmd is not even present - does it mean the page has never been touched? but this is probably not an error */ + + spin_unlock(&mm->page_table_lock); + } + } + return 0; +} + + +/* pages: array of page addresses, whose bit 0 is a flag to say it's discardable or not */ +/* n: number of pages in the array to process */ +asmlinkage long sys_vm_relinquish(void ** pages, unsigned int n) +{ + long error = -EINVAL; + + down_read(¤t->mm->mmap_sem); + + if (n == 0 || n > 1000) /* deal with at most 1000 pages at a time, should be a more meaningful number */ + goto out; + + error = -EAGAIN; + void ** tmp = (void **)__get_free_page(GFP_KERNEL); /* use a temporary page to store page addresses from user */ + if (!tmp) + goto out; + + long remaining = n; + long i; + for (i = 0; remaining > 0; remaining -= PAGE_SIZE / sizeof(void *), i++) { + long thispiece = (remaining < PAGE_SIZE / sizeof(void *)) ? 
remaining : PAGE_SIZE / sizeof(void *); + if (copy_from_user(tmp, pages + PAGE_SIZE * i, thispiece * sizeof(void *))) { + error = -EFAULT; + break; + } + /* process a page worth page addresses */ + error = vm_relinquish_page(tmp, thispiece); + if (error) + break; + } + + free_page((unsigned long)tmp); /* free the temporary page we used */ + + out: + up_read(¤t->mm->mmap_sem); + return error; +} + + +/* returns numbers of swap out's and in's during the last SWAP_HIST seconds in arrays provided by user */ +asmlinkage long sys_vm_getswaprate(int n, int * swap_out, int * swap_in) +{ + int temp_swap_out[SWAP_HIST]; + int temp_swap_in[SWAP_HIST]; + int i, j, k = 0; + if (n <= 0 || n > SWAP_HIST) + return -EINVAL; + i = swap_hist_index; + BUG_ON(i < 0 || i >= SWAP_HIST); + for (j = i; j >= 0; j--) { + temp_swap_out[k] = swap_out_hist[j]; + temp_swap_in[k] = swap_in_hist[j]; + k++; + } + for (j = SWAP_HIST - 1; j > i; j--) { + temp_swap_out[k] = swap_out_hist[j]; + temp_swap_in[k] = swap_in_hist[j]; + k++; + } + if (copy_to_user(swap_out, temp_swap_out, sizeof(temp_swap_out)) || copy_to_user(swap_in, temp_swap_in, sizeof(temp_swap_in))) { + return -EFAULT; + } + return 0; +} diff -Nur linux-2.4.24/mm/vmscan.c linux-2.4.24-vmcomm/mm/vmscan.c --- linux-2.4.24/mm/vmscan.c 2003-11-28 13:26:21.000000000 -0500 +++ linux-2.4.24-vmcomm/mm/vmscan.c 2004-07-17 16:50:30.000000000 -0400 @@ -26,6 +26,8 @@ #include +#include /* here we have to use some functions from rmap.h */ + /* * "vm_passes" is the number of vm passes before failing the * memory balancing. Take into account 3 passes are needed @@ -74,7 +76,8 @@ */ /* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +/* try_to_swap_out is no longer static, because vmcomm.c uses it */ +inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) { pte_t pte; swp_entry_t entry; @@ -118,9 +121,20 @@ swap_duplicate(entry); set_swap_pte: set_pte(page_table, swp_entry_to_pte(entry)); + swap_add_rmap(entry, page_table); /* set the swap reverse mapping since the pte is changed to point to the swap entry */ + + /* the page is unlinked and likely to be swapped out soon, so we send the interested process a signal */ + /* we put the code here instead of below drop_pte because we are now only interested in pages in swap cache, not file-backed pages */ + if (mm->sigproc && (mm->sigflag & VM_SWAPPING_OUT)) { + send_vm_sig(mm->sigproc, (void *)address, VM_SWAPPING_OUT); + } + /* end */ + drop_pte: mm->rss--; UnlockPage(page); + /* LWRMAP */ + page_remove_rmap(page, page_table); { int freeable = page_count(page) - !!page->buffers <= 2; page_cache_release(page); @@ -552,6 +566,32 @@ swap.val = page->index; __delete_from_swap_cache(page); spin_unlock(&pagecache_lock); + + /* send interested process(es) a signal that the page is swapped out */ + struct pte_chain ** pcp; + pcp = get_swap_pte_chain_map(swap); /* lock sdev_lock here */ + +#ifdef ERIC_DEBUG + BUG_ON(!pcp); +#endif + + struct pte_chain * pc = *pcp; + pte_t * ptep; + struct mm_struct * mm; + unsigned long address; + while (pc != NULL) { /* scan through all reverse mappings of the swap entry */ + ptep = pc->ptep; + mm = ptep_to_mm(ptep); + address = ptep_to_address(ptep); + if (mm->sigproc && (mm->sigflag & VM_SWAPPED_OUT)) { /* send VM signal to interested process that the page is swapped out */ + send_vm_sig(mm->sigproc, (void *)address, 
VM_SWAPPED_OUT); + } + pc = pc->next; + } + + put_swap_pte_chain_map(swap); /* unlock sdev_lock here */ + /* end */ + swap_free(swap); } @@ -560,6 +600,7 @@ /* effectively free the page here */ page_cache_release(page); + atomic_inc(&nr_recent_swap_out); /* */ if (--nr_pages) continue; break;