| author    | Daniel Robbins <drobbins@gentoo.org> | 2001-11-16 22:12:25 +0000 |
|-----------|--------------------------------------|---------------------------|
| committer | Daniel Robbins <drobbins@gentoo.org> | 2001-11-16 22:12:25 +0000 |
| commit    | 3a7632d575ac3a60c6ca9541d0b48e09c881c38e (patch) | |
| tree      | 1ee37744646d132a6ddab94faafaa66579a2584c /sys-kernel | |
| parent    | ifixo (diff) | |
missing stuffs
Diffstat (limited to 'sys-kernel')
3 files changed, 4785 insertions, 0 deletions
diff --git a/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/array.c b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/array.c
new file mode 100644
index 000000000000..188ce6b49953
--- /dev/null
+++ b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/array.c
@@ -0,0 +1,698 @@
+/*
+ * linux/fs/proc/array.c
+ *
+ * Copyright (C) 1992 by Linus Torvalds
+ * based on ideas by Darren Senn
+ *
+ * Fixes:
+ * Michael. K. Johnson: stat,statm extensions.
+ *   <johnsonm@stolaf.edu>
+ *
+ * Pauline Middelink : Made cmdline,envline only break at '\0's, to
+ *   make sure SET_PROCTITLE works. Also removed
+ *   bad '!' which forced address recalculation for
+ *   EVERY character on the current page.
+ *   <middelin@polyware.iaf.nl>
+ *
+ * Danny ter Haar : added cpuinfo
+ *   <dth@cistron.nl>
+ *
+ * Alessandro Rubini : profile extension.
+ *   <rubini@ipvvis.unipv.it>
+ *
+ * Jeff Tranter : added BogoMips field to cpuinfo
+ *   <Jeff_Tranter@Mitel.COM>
+ *
+ * Bruno Haible : remove 4K limit for the maps file
+ *   <haible@ma2s2.mathematik.uni-karlsruhe.de>
+ *
+ * Yves Arrouye : remove removal of trailing spaces in get_array.
+ *   <Yves.Arrouye@marin.fdn.fr>
+ *
+ * Jerome Forissier : added per-CPU time information to /proc/stat
+ *   and /proc/<pid>/cpu extension
+ *   <forissier@isia.cma.fr>
+ *   - Incorporation and non-SMP safe operation
+ *   of forissier patch in 2.1.78 by
+ *   Hans Marcus <crowbar@concepts.nl>
+ *
+ * aeb@cwi.nl : /proc/partitions
+ *
+ *
+ * Alan Cox : security fixes.
+ *   <Alan.Cox@linux.org>
+ *
+ * Al Viro : safe handling of mm_struct
+ *
+ * Gerhard Wichert : added BIGMEM support
+ *   Siemens AG <Gerhard.Wichert@pdb.siemens.de>
+ *
+ * Al Viro & Jeff Garzik : moved most of the thing into base.c and
+ *   : proc_misc.c. The rest may eventually go into
+ *   : base.c too.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/tty.h>
+#include <linux/string.h>
+#include <linux/mman.h>
+#include <linux/proc_fs.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/signal.h>
+#include <linux/highmem.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+/* Gcc optimizes away "strlen(x)" for constant x */
+#define ADDBUF(buffer, string) \
+do { memcpy(buffer, string, strlen(string)); \
+	buffer += strlen(string); } while (0)
+
+static inline char * task_name(struct task_struct *p, char * buf)
+{
+	int i;
+	char * name;
+
+	ADDBUF(buf, "Name:\t");
+	name = p->comm;
+	i = sizeof(p->comm);
+	do {
+		unsigned char c = *name;
+		name++;
+		i--;
+		*buf = c;
+		if (!c)
+			break;
+		if (c == '\\') {
+			buf[1] = c;
+			buf += 2;
+			continue;
+		}
+		if (c == '\n') {
+			buf[0] = '\\';
+			buf[1] = 'n';
+			buf += 2;
+			continue;
+		}
+		buf++;
+	} while (i);
+	*buf = '\n';
+	return buf+1;
+}
+
+/*
+ * The task state array is a strange "bitmap" of
+ * reasons to sleep. Thus "running" is zero, and
+ * you can test for combinations of others with
+ * simple bit tests.
+ */ +static const char *task_state_array[] = { + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "Z (zombie)", /* 4 */ + "T (stopped)", /* 8 */ + "W (paging)" /* 16 */ +}; + +static inline const char * get_task_state(struct task_struct *tsk) +{ + unsigned int state = tsk->state & (TASK_RUNNING | + TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE | + TASK_ZOMBIE | + TASK_STOPPED); + const char **p = &task_state_array[0]; + + while (state) { + p++; + state >>= 1; + } + return *p; +} + +static inline char * task_state(struct task_struct *p, char *buffer) +{ + int g; + + read_lock(&tasklist_lock); + buffer += sprintf(buffer, + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), p->tgid, + p->pid, p->pid ? p->p_opptr->pid : 0, 0, + p->uid, p->euid, p->suid, p->fsuid, + p->gid, p->egid, p->sgid, p->fsgid); + read_unlock(&tasklist_lock); + task_lock(p); + buffer += sprintf(buffer, + "FDSize:\t%d\n" + "Groups:\t", + p->files ? p->files->max_fds : 0); + task_unlock(p); + + for (g = 0; g < p->ngroups; g++) + buffer += sprintf(buffer, "%d ", p->groups[g]); + + buffer += sprintf(buffer, "\n"); + return buffer; +} + +static inline char * task_mem(struct mm_struct *mm, char *buffer) +{ + struct vm_area_struct * vma; + unsigned long data = 0, stack = 0; + unsigned long exec = 0, lib = 0; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long len = (vma->vm_end - vma->vm_start) >> 10; + if (!vma->vm_file) { + data += len; + if (vma->vm_flags & VM_GROWSDOWN) + stack += len; + continue; + } + if (vma->vm_flags & VM_WRITE) + continue; + if (vma->vm_flags & VM_EXEC) { + exec += len; + if (vma->vm_flags & VM_EXECUTABLE) + continue; + lib += len; + } + } + buffer += sprintf(buffer, + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB\n", + mm->total_vm << (PAGE_SHIFT-10), + mm->locked_vm << (PAGE_SHIFT-10), + mm->rss << (PAGE_SHIFT-10), + data - stack, stack, + exec - lib, lib); + up_read(&mm->mmap_sem); + return buffer; +} + +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, + sigset_t *catch) +{ + struct k_sigaction *k; + int i; + + sigemptyset(ign); + sigemptyset(catch); + + if (p->sig) { + k = p->sig->action; + for (i = 1; i <= _NSIG; ++i, ++k) { + if (k->sa.sa_handler == SIG_IGN) + sigaddset(ign, i); + else if (k->sa.sa_handler != SIG_DFL) + sigaddset(catch, i); + } + } +} + +static inline char * task_sig(struct task_struct *p, char *buffer) +{ + sigset_t ign, catch; + + buffer += sprintf(buffer, "SigPnd:\t"); + buffer = render_sigset_t(&p->pending.signal, buffer); + *buffer++ = '\n'; + buffer += sprintf(buffer, "SigBlk:\t"); + buffer = render_sigset_t(&p->blocked, buffer); + *buffer++ = '\n'; + + collect_sigign_sigcatch(p, &ign, &catch); + buffer += sprintf(buffer, "SigIgn:\t"); + buffer = render_sigset_t(&ign, buffer); + *buffer++ = '\n'; + buffer += sprintf(buffer, "SigCgt:\t"); /* Linux 2.0 uses "SigCgt" */ + buffer = render_sigset_t(&catch, buffer); + *buffer++ = '\n'; + + return buffer; +} + +static inline char *task_cap(struct task_struct *p, char *buffer) +{ + return buffer + sprintf(buffer, "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + cap_t(p->cap_inheritable), + cap_t(p->cap_permitted), + cap_t(p->cap_effective)); +} + + +int proc_pid_status(struct task_struct *task, char * buffer) +{ 
+ char * orig = buffer; + struct mm_struct *mm; + + buffer = task_name(task, buffer); + buffer = task_state(task, buffer); + task_lock(task); + mm = task->mm; + if(mm) + atomic_inc(&mm->mm_users); + task_unlock(task); + if (mm) { + buffer = task_mem(mm, buffer); + mmput(mm); + } + buffer = task_sig(task, buffer); + buffer = task_cap(task, buffer); +#if defined(CONFIG_ARCH_S390) + buffer = task_show_regs(task, buffer); +#endif + return buffer - orig; +} + +int proc_pid_stat(struct task_struct *task, char * buffer) +{ + unsigned long vsize, eip, esp, wchan; + long priority, nice; + int tty_pgrp = -1, tty_nr = 0; + sigset_t sigign, sigcatch; + char state; + int res; + pid_t ppid; + struct mm_struct *mm; + + state = *get_task_state(task); + vsize = eip = esp = 0; + task_lock(task); + mm = task->mm; + if(mm) + atomic_inc(&mm->mm_users); + if (task->tty) { + tty_pgrp = task->tty->pgrp; + tty_nr = kdev_t_to_nr(task->tty->device); + } + task_unlock(task); + if (mm) { + struct vm_area_struct *vma; + down_read(&mm->mmap_sem); + vma = mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + up_read(&mm->mmap_sem); + } + + wchan = get_wchan(task); + + collect_sigign_sigcatch(task, &sigign, &sigcatch); + + /* scale priority and nice values from timeslices to -20..20 */ + /* to make it look like a "normal" Unix priority/nice value */ + priority = task->counter; + priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER; + nice = task->nice; + + read_lock(&tasklist_lock); + ppid = task->pid ? task->p_opptr->pid : 0; + read_unlock(&tasklist_lock); + res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ +%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \ +%lu %lu %lu %lu %lu %lu %lu %lu %d %d\n", + task->pid, + task->comm, + state, + ppid, + task->pgrp, + task->session, + tty_nr, + tty_pgrp, + task->flags, + task->min_flt, + task->cmin_flt, + task->maj_flt, + task->cmaj_flt, + task->times.tms_utime, + task->times.tms_stime, + task->times.tms_cutime, + task->times.tms_cstime, + priority, + nice, + 0UL /* removed */, + task->it_real_value, + task->start_time, + vsize, + mm ? mm->rss : 0, /* you might want to shift this left 3 */ + task->rlim[RLIMIT_RSS].rlim_cur, + mm ? mm->start_code : 0, + mm ? mm->end_code : 0, + mm ? mm->start_stack : 0, + esp, + eip, + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. 
+ */ + task->pending.signal.sig[0] & 0x7fffffffUL, + task->blocked.sig[0] & 0x7fffffffUL, + sigign .sig[0] & 0x7fffffffUL, + sigcatch .sig[0] & 0x7fffffffUL, + wchan, + task->nswap, + task->cnswap, + task->exit_signal, + task->processor); + if(mm) + mmput(mm); + return res; +} + +static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, + int * pages, int * shared, int * dirty, int * total) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + pte = pte_offset(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t page; + struct page *ptpage; + + conditional_schedule(); + page=*pte; + address += PAGE_SIZE; + pte++; + if (pte_none(page)) + continue; + ++*total; + if (!pte_present(page)) + continue; + ptpage = pte_page(page); + if ((!VALID_PAGE(ptpage)) || PageReserved(ptpage)) + continue; + ++*pages; + if (pte_dirty(page)) + ++*dirty; + if (page_count(pte_page(page)) > 1) + ++*shared; + } while (address < end); +} + +static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, + int * pages, int * shared, int * dirty, int * total) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*pgd)) + return; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return; + } + pmd = pmd_offset(pgd, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + statm_pte_range(pmd, address, end - address, pages, shared, dirty, total); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); +} + +static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end, + int * pages, int * shared, int * dirty, int * total) +{ + while (address < end) { + statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + pgd++; + } +} + +int proc_pid_statm(struct task_struct *task, char * buffer) +{ + struct mm_struct *mm; + int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0; + + task_lock(task); + mm = task->mm; + if(mm) + atomic_inc(&mm->mm_users); + task_unlock(task); + if (mm) { + struct vm_area_struct * vma; + down_read(&mm->mmap_sem); + vma = mm->mmap; + while (vma) { + pgd_t *pgd = pgd_offset(mm, vma->vm_start); + int pages = 0, shared = 0, dirty = 0, total = 0; + + statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total); + resident += pages; + share += shared; + dt += dirty; + size += total; + if (vma->vm_flags & VM_EXECUTABLE) + trs += pages; /* text */ + else if (vma->vm_flags & VM_GROWSDOWN) + drs += pages; /* stack */ + else if (vma->vm_end > 0x60000000) + lrs += pages; /* library */ + else + drs += pages; + vma = vma->vm_next; + } + up_read(&mm->mmap_sem); + mmput(mm); + } + return sprintf(buffer,"%d %d %d %d %d %d %d\n", + size, resident, share, trs, lrs, drs, dt); +} + +/* + * The way we support synthetic files > 4K + * - without storing their contents in some buffer and + * - without walking through the entire synthetic file until we reach the + * position of the requested data + * is to cleverly encode the current position in the file's f_pos field. + * There is no requirement that a read() call which returns `count' bytes + * of data increases f_pos by exactly `count'. + * + * This idea is Linus' one. Bruno implemented it. 
+ */ + +/* + * For the /proc/<pid>/maps file, we use fixed length records, each containing + * a single line. + * + * f_pos = (number of the vma in the task->mm->mmap list) * PAGE_SIZE + * + (index into the line) + */ +/* for systems with sizeof(void*) == 4: */ +#define MAPS_LINE_FORMAT4 "%08lx-%08lx %s %08lx %s %lu" +#define MAPS_LINE_MAX4 49 /* sum of 8 1 8 1 4 1 8 1 5 1 10 1 */ + +/* for systems with sizeof(void*) == 8: */ +#define MAPS_LINE_FORMAT8 "%016lx-%016lx %s %016lx %s %lu" +#define MAPS_LINE_MAX8 73 /* sum of 16 1 16 1 4 1 16 1 5 1 10 1 */ + +#define MAPS_LINE_FORMAT (sizeof(void*) == 4 ? MAPS_LINE_FORMAT4 : MAPS_LINE_FORMAT8) +#define MAPS_LINE_MAX (sizeof(void*) == 4 ? MAPS_LINE_MAX4 : MAPS_LINE_MAX8) + +static int proc_pid_maps_get_line (char *buf, struct vm_area_struct *map) +{ + /* produce the next line */ + char *line; + char str[5]; + int flags; + kdev_t dev; + unsigned long ino; + int len; + + flags = map->vm_flags; + + str[0] = flags & VM_READ ? 'r' : '-'; + str[1] = flags & VM_WRITE ? 'w' : '-'; + str[2] = flags & VM_EXEC ? 'x' : '-'; + str[3] = flags & VM_MAYSHARE ? 's' : 'p'; + str[4] = 0; + + dev = 0; + ino = 0; + if (map->vm_file != NULL) { + dev = map->vm_file->f_dentry->d_inode->i_dev; + ino = map->vm_file->f_dentry->d_inode->i_ino; + line = d_path(map->vm_file->f_dentry, + map->vm_file->f_vfsmnt, + buf, PAGE_SIZE); + buf[PAGE_SIZE-1] = '\n'; + line -= MAPS_LINE_MAX; + if(line < buf) + line = buf; + } else + line = buf; + + len = sprintf(line, + MAPS_LINE_FORMAT, + map->vm_start, map->vm_end, str, map->vm_pgoff << PAGE_SHIFT, + kdevname(dev), ino); + + if(map->vm_file) { + int i; + for(i = len; i < MAPS_LINE_MAX; i++) + line[i] = ' '; + len = buf + PAGE_SIZE - line; + memmove(buf, line, len); + } else + line[len++] = '\n'; + return len; +} + +ssize_t proc_pid_read_maps (struct task_struct *task, struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm; + struct vm_area_struct * map; + char *tmp, *kbuf; + long retval; + int off, lineno, loff; + + /* reject calls with out of range parameters immediately */ + retval = 0; + if (*ppos > LONG_MAX) + goto out; + if (count == 0) + goto out; + off = (long)*ppos; + /* + * We might sleep getting the page, so get it first. 
+ */ + retval = -ENOMEM; + kbuf = (char*)__get_free_page(GFP_KERNEL); + if (!kbuf) + goto out; + + tmp = (char*)__get_free_page(GFP_KERNEL); + if (!tmp) + goto out_free1; + + task_lock(task); + mm = task->mm; + if (mm) + atomic_inc(&mm->mm_users); + task_unlock(task); + retval = 0; + if (!mm) + goto out_free2; + + down_read(&mm->mmap_sem); + map = mm->mmap; + lineno = 0; + loff = 0; + if (count > PAGE_SIZE) + count = PAGE_SIZE; + while (map) { + int len; + if (off > PAGE_SIZE) { + off -= PAGE_SIZE; + goto next; + } + len = proc_pid_maps_get_line(tmp, map); + len -= off; + if (len > 0) { + if (retval+len > count) { + /* only partial line transfer possible */ + len = count - retval; + /* save the offset where the next read + * must start */ + loff = len+off; + } + memcpy(kbuf+retval, tmp+off, len); + retval += len; + } + off = 0; +next: + if (!loff) + lineno++; + if (retval >= count) + break; + if (loff) BUG(); + map = map->vm_next; + } + up_read(&mm->mmap_sem); + mmput(mm); + + if (retval > count) BUG(); + if (copy_to_user(buf, kbuf, retval)) + retval = -EFAULT; + else + *ppos = (lineno << PAGE_SHIFT) + loff; + +out_free2: + free_page((unsigned long)tmp); +out_free1: + free_page((unsigned long)kbuf); +out: + return retval; +} + +#ifdef CONFIG_SMP +int proc_pid_cpu(struct task_struct *task, char * buffer) +{ + int i, len; + + len = sprintf(buffer, + "cpu %lu %lu\n", + task->times.tms_utime, + task->times.tms_stime); + + for (i = 0 ; i < smp_num_cpus; i++) + len += sprintf(buffer + len, "cpu%d %lu %lu\n", + i, + task->per_cpu_utime[cpu_logical_map(i)], + task->per_cpu_stime[cpu_logical_map(i)]); + + return len; +} +#endif diff --git a/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/filemap.c b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/filemap.c new file mode 100644 index 000000000000..8c98b0d81bf1 --- /dev/null +++ b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/filemap.c @@ -0,0 +1,3143 @@ +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/locks.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/smp_lock.h> +#include <linux/blkdev.h> +#include <linux/file.h> +#include <linux/swapctl.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/iobuf.h> + +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/mman.h> + +#include <linux/highmem.h> + +/* +* Shared mappings implemented 30.11.1994. It's not fully working yet, +* though. +* +* Shared mappings now work. 15.8.1995 Bruno. +* +* finished 'unifying' the page and buffer cache and SMP-threaded the +* page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> +* +* SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> +*/ + +unsigned long page_cache_size; +unsigned int page_hash_bits; +struct page **page_hash_table; + +spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED}; + +/* +* NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock +* with the pagecache_lock held. 
+* +* Ordering: +* swap_lock -> +* pagemap_lru_lock -> +* pagecache_lock +*/ +spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED}; + +#define CLUSTER_PAGES (1 << page_cluster) +#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) + +static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p)); +static void add_page_to_hash_queue(struct page * page, struct page **p) +{ +struct page *next = *p; + +*p = page; +page->next_hash = next; +page->pprev_hash = p; +if (next) + next->pprev_hash = &page->next_hash; +if (page->buffers) + PAGE_BUG(page); +inc_nr_cache_pages(page); +} + +static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) +{ +struct list_head *head = &mapping->clean_pages; + +mapping->nrpages++; +list_add(&page->list, head); +page->mapping = mapping; +} + +static inline void remove_page_from_inode_queue(struct page * page) +{ +struct address_space * mapping = page->mapping; + +mapping->nrpages--; +list_del(&page->list); +page->mapping = NULL; +} + +static inline void remove_page_from_hash_queue(struct page * page) +{ +struct page *next = page->next_hash; +struct page **pprev = page->pprev_hash; + +if (next) + next->pprev_hash = pprev; +*pprev = next; +page->pprev_hash = NULL; +dec_nr_cache_pages(page); +} + +/* +* Remove a page from the page cache and free it. Caller has to make +* sure the page is locked and that nobody else uses it - or that usage +* is safe. +*/ +void __remove_inode_page(struct page *page) +{ +if (PageDirty(page)) BUG(); +remove_page_from_inode_queue(page); +remove_page_from_hash_queue(page); +} + +void remove_inode_page(struct page *page) +{ +if (!PageLocked(page)) + PAGE_BUG(page); + +spin_lock(&pagecache_lock); +__remove_inode_page(page); +spin_unlock(&pagecache_lock); +} + +static inline int sync_page(struct page *page) +{ +struct address_space *mapping = page->mapping; + +if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + return mapping->a_ops->sync_page(page); +return 0; +} + +/* +* Add a page to the dirty page list. +*/ +void set_page_dirty(struct page *page) +{ +if (!test_and_set_bit(PG_dirty, &page->flags)) { + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock(&pagecache_lock); + list_del(&page->list); + list_add(&page->list, &mapping->dirty_pages); + spin_unlock(&pagecache_lock); + + if (mapping->host) + mark_inode_dirty_pages(mapping->host); + } +} +} + +/** +* invalidate_inode_pages - Invalidate all the unlocked pages of one inode +* @inode: the inode which pages we want to invalidate +* +* This function only removes the unlocked pages, if you want to +* remove all the pages of one inode, you must call truncate_inode_pages. +*/ + +void invalidate_inode_pages(struct inode * inode) +{ +struct list_head *head, *curr; +struct page * page; + +head = &inode->i_mapping->clean_pages; + +spin_lock(&pagemap_lru_lock); +spin_lock(&pagecache_lock); +curr = head->next; + +while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + + /* We cannot invalidate something in dirty.. 
*/ + if (PageDirty(page)) + continue; + + /* ..or locked */ + if (TryLockPage(page)) + continue; + + if (page->buffers && !try_to_free_buffers(page, 0)) + goto unlock; + + if (page_count(page) != 1) + goto unlock; + + __lru_cache_del(page); + __remove_inode_page(page); + UnlockPage(page); + page_cache_release(page); + continue; +unlock: + UnlockPage(page); + continue; +} + +spin_unlock(&pagecache_lock); +spin_unlock(&pagemap_lru_lock); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ +memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + +if (page->buffers) + block_flushpage(page, partial); + +} + +static void truncate_complete_page(struct page *page) +{ +/* Leave it on the LRU if it gets converted into anonymous buffers */ +if (!page->buffers || block_flushpage(page, 0)) + lru_cache_del(page); + +/* + * We remove the page from the page cache _after_ we have + * destroyed all buffer-cache references to it. Otherwise some + * other process might think this inode page is not in the + * page cache and creates a buffer-cache alias to it causing + * all sorts of fun problems ... + */ +ClearPageDirty(page); +ClearPageUptodate(page); +remove_inode_page(page); +page_cache_release(page); +} + +static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); +static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) +{ +struct list_head *curr; +struct page * page; +int unlocked = 0; + +restart: +curr = head->prev; +while (curr != head) { + unsigned long offset; + + page = list_entry(curr, struct page, list); + offset = page->index; + + /* Is one of the pages to truncate? */ + if ((offset >= start) || (*partial && (offset + 1) == start)) { + int failed; + + page_cache_get(page); + failed = TryLockPage(page); + + list_del(head); + if (!failed) + /* Restart after this page */ + list_add_tail(head, curr); + else + /* Restart on this page */ + list_add(head, curr); + + spin_unlock(&pagecache_lock); + conditional_schedule(); + unlocked = 1; + + if (!failed) { + if (*partial && (offset + 1) == start) { + truncate_partial_page(page, *partial); + *partial = 0; + } else + truncate_complete_page(page); + + UnlockPage(page); + } else + wait_on_page(page); + + page_cache_release(page); + + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + curr = curr->prev; + } + return unlocked; +} + + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * + * Truncate the page cache at a set offset, removing the pages + * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. 
+ */ +void truncate_inode_pages(struct address_space * mapping, loff_t lstart) +{ + unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); + } while (unlocked); + /* Traversed all three lists without dropping the lock */ + spin_unlock(&pagecache_lock); +} + +static inline int invalidate_this_page2(struct page * page, + struct list_head * curr, + struct list_head * head) +{ + int unlocked = 1; + + /* + * The page is locked and we hold the pagecache_lock as well + * so both page_count(page) and page->buffers stays constant here. + */ + if (page_count(page) == 1 + !!page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + truncate_complete_page(page); + } else { + if (page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + block_invalidate_page(page); + } else + unlocked = 0; + + ClearPageDirty(page); + ClearPageUptodate(page); + } + + return unlocked; +} + +static int FASTCALL(invalidate_list_pages2(struct list_head *)); +static int invalidate_list_pages2(struct list_head *head) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + page = list_entry(curr, struct page, list); + + if (!TryLockPage(page)) { + int __unlocked; + + __unlocked = invalidate_this_page2(page, curr, head); + UnlockPage(page); + unlocked |= __unlocked; + if (!__unlocked) { + curr = curr->prev; + continue; + } + } else { + /* Restart on this page */ + list_del(head); + list_add(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + unlocked = 1; + wait_on_page(page); + } + + page_cache_release(page); + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + return unlocked; +} + +/** + * invalidate_inode_pages2 - Clear all the dirty bits around if it can't + * free the pages because they're mapped. + * @mapping: the address_space which pages we want to invalidate + */ +void invalidate_inode_pages2(struct address_space * mapping) +{ + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = invalidate_list_pages2(&mapping->clean_pages); + unlocked |= invalidate_list_pages2(&mapping->dirty_pages); + unlocked |= invalidate_list_pages2(&mapping->locked_pages); + } while (unlocked); + spin_unlock(&pagecache_lock); +} + +static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) +{ + goto inside; + + for (;;) { + page = page->next_hash; +inside: + if (!page) + goto not_found; + if (page->mapping != mapping) + continue; + if (page->index == offset) + break; + } + +not_found: + return page; +} + +/* + * By the time this is called, the page is locked and + * we don't have to worry about any races any more. + * + * Start the IO.. 
+ */ +static int writeout_one_page(struct page *page) +{ + struct buffer_head *bh, *head = page->buffers; + + bh = head; + do { + if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) + continue; + + bh->b_flushtime = jiffies; + ll_rw_block(WRITE, 1, &bh); + } while ((bh = bh->b_this_page) != head); + return 0; +} + +int waitfor_one_page(struct page *page) +{ + int error = 0; + struct buffer_head *bh, *head = page->buffers; + + bh = head; + do { + wait_on_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + error = -EIO; + } while ((bh = bh->b_this_page) != head); + return error; +} + +static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) +{ + struct list_head *curr; + struct page *page; + int retval = 0; + + spin_lock(&pagecache_lock); + curr = head->next; + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + if (!page->buffers) + continue; + if (page->index >= end) + continue; + if (page->index < start) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() (only used by minixfs, udf) */ + lock_page(page); + + /* The buffers could have been free'd while we waited for the page lock */ + if (page->buffers) + retval |= fn(page); + + UnlockPage(page); + spin_lock(&pagecache_lock); + curr = page->list.next; + page_cache_release(page); + } + spin_unlock(&pagecache_lock); + + return retval; +} + +/* + * Two-stage data sync: first start the IO, then go back and + * collect the information.. + */ +int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx) +{ + int retval; + + /* writeout dirty buffers on pages from both clean and dirty lists */ + retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page); + + /* now wait for locked buffers on pages from both clean and dirty lists */ + retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page); + + return retval; +} + +/* + * In-memory filesystems have to fail their + * writepage function - and this has to be + * worked around in the VM layer.. + * + * We + * - mark the page dirty again (but do NOT + * add it back to the inode dirty list, as + * that would livelock in fdatasync) + * - activate the page so that the page stealer + * doesn't try to write it out over and over + * again. + */ +int fail_writepage(struct page *page) +{ + activate_page(page); + SetPageReferenced(page); + SetPageDirty(page); + UnlockPage(page); + return 0; +} + +EXPORT_SYMBOL(fail_writepage); + +/** + * filemap_fdatasync - walk the list of dirty pages of the given address space + * and writepage() all of them. 
+ * + * @mapping: address space structure to write + * + */ +void filemap_fdatasync(struct address_space * mapping) +{ + int (*writepage)(struct page *) = mapping->a_ops->writepage; + + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->dirty_pages)) { + struct page *page = list_entry(mapping->dirty_pages.next, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->locked_pages); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + conditional_schedule(); /* sys_msync() */ + + if (!PageDirty(page)) + goto clean; + + lock_page(page); + + if (PageDirty(page)) { + ClearPageDirty(page); + writepage(page); + } else + UnlockPage(page); +clean: + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); +} + +/** + * filemap_fdatawait - walk the list of locked pages of the given address space + * and wait for all of them. + * + * @mapping: address space structure to wait for + * + */ +void filemap_fdatawait(struct address_space * mapping) +{ + DEFINE_RESCHED_COUNT; +restart: + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->locked_pages)) { + struct page *page = list_entry(mapping->locked_pages.next, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->clean_pages); + + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + page_cache_get(page); + spin_unlock(&pagecache_lock); + unconditional_schedule(); + page_cache_release(page); + goto restart; + } + } + + if (!PageLocked(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + ___wait_on_page(page); + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); +} + +/* + * Add a page to the inode page cache. + * + * The caller must have locked the page and + * set all the page flags correctly.. + */ +void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) +{ + if (!PageLocked(page)) + BUG(); + + page->index = index; + page_cache_get(page); + spin_lock(&pagecache_lock); + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, page_hash(mapping, index)); + spin_unlock(&pagecache_lock); + + lru_cache_add(page); +} + +/* + * This adds a page to the page cache, starting out as locked, + * owned by us, but unreferenced, not uptodate and with no errors. 
+ */ +static inline void __add_to_page_cache(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + unsigned long flags; + + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked); + page->flags = flags | (1 << PG_locked); + page_cache_get(page); + page->index = offset; + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, hash); +} + +void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) +{ + spin_lock(&pagecache_lock); + __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); + spin_unlock(&pagecache_lock); + lru_cache_add(page); +} + +int add_to_page_cache_unique(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + int err; + struct page *alias; + + spin_lock(&pagecache_lock); + alias = __find_page_nolock(mapping, offset, *hash); + + err = 1; + if (!alias) { + __add_to_page_cache(page,mapping,offset,hash); + err = 0; + } + + spin_unlock(&pagecache_lock); + if (!err) + lru_cache_add(page); + return err; +} + +/* + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. + */ +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int page_cache_read(struct file * file, unsigned long offset) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct page **hash = page_hash(mapping, offset); + struct page *page; + + conditional_schedule(); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + if (page) + return 0; + + page = page_cache_alloc(mapping); + if (!page) + return -ENOMEM; + + if (!add_to_page_cache_unique(page, mapping, offset, hash)) { + int error = mapping->a_ops->readpage(file, page); + page_cache_release(page); + return error; + } + /* + * We arrive here in the unlikely event that someone + * raced with us and added our page to the cache first. + */ + page_cache_release(page); + return 0; +} + +/* + * Read in an entire cluster at once. A cluster is usually a 64k- + * aligned block that includes the page requested in "offset." + */ +static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize)); +static int read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize) +{ + unsigned long pages = CLUSTER_PAGES; + + offset = CLUSTER_OFFSET(offset); + while ((pages-- > 0) && (offset < filesize)) { + int error = page_cache_read(file, offset); + if (error < 0) + return error; + offset ++; + } + + return 0; +} + +/* + * Wait for a page to get unlocked. + * + * This must be called with the caller "holding" the page, + * ie with increased "page->count" so that the page won't + * go away during the wait.. 
+ */ +void ___wait_on_page(struct page *page) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&page->wait, &wait); + do { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page)) + break; + sync_page(page); + schedule(); + } while (PageLocked(page)); + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); +} + +void unlock_page(struct page *page) +{ + ClearPageLaunder(page); + smp_mb__before_clear_bit(); + if (!test_and_clear_bit(PG_locked, &(page)->flags)) + BUG(); + smp_mb__after_clear_bit(); + if (waitqueue_active(&(page)->wait)) + wake_up(&(page)->wait); +} + +/* + * Get a lock on the page, assuming we need to sleep + * to get it.. + */ +static void __lock_page(struct page *page) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue_exclusive(&page->wait, &wait); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (PageLocked(page)) { + sync_page(page); + schedule(); + } + if (!TryLockPage(page)) + break; + } + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); +} + + +/* + * Get an exclusive lock on the page, optimistically + * assuming it's not locked.. + */ +void lock_page(struct page *page) +{ + if (TryLockPage(page)) + __lock_page(page); +} + +/* + * a rather lightweight function, finding and getting a reference to a + * hashed page atomically. + */ +struct page * __find_get_page(struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) + page_cache_get(page); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but trylock it instead of incrementing the count. + */ +struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + struct page **hash = page_hash(mapping, offset); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) { + if (TryLockPage(page)) + page = NULL; + } + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Must be called with the pagecache lock held, + * will return with it held (but it may be dropped + * during blocking operations.. + */ +static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *)); +static struct page * __find_lock_page_helper(struct address_space *mapping, + unsigned long offset, struct page *hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ +repeat: + conditional_schedule(); /* unlink large files */ + page = __find_page_nolock(mapping, offset, hash); + if (page) { + page_cache_get(page); + if (TryLockPage(page)) { + spin_unlock(&pagecache_lock); + lock_page(page); + spin_lock(&pagecache_lock); + + /* Has the page been re-allocated while we slept? */ + if (page->mapping != mapping || page->index != offset) { + UnlockPage(page); + page_cache_release(page); + goto repeat; + } + } + } + return page; +} + +/* + * Same as the above, but lock the page too, verifying that + * it's still valid once we own it. 
+ */ +struct page * __find_lock_page (struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but create the page if required.. + */ +struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask) +{ + struct page *page; + struct page **hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + spin_unlock(&pagecache_lock); + if (!page) { + struct page *newpage = alloc_page(gfp_mask); + page = ERR_PTR(-ENOMEM); + if (newpage) { + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + if (likely(!page)) { + page = newpage; + __add_to_page_cache(page, mapping, index, hash); + newpage = NULL; + } + spin_unlock(&pagecache_lock); + if (newpage == NULL) + lru_cache_add(page); + else + page_cache_release(newpage); + } + } + return page; +} + +/* + * Returns locked page at given index in given cache, creating it if needed. + */ +struct page *grab_cache_page(struct address_space *mapping, unsigned long index) +{ + return find_or_create_page(mapping, index, mapping->gfp_mask); +} + + +/* + * Same as grab_cache_page, but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + */ +struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +{ + struct page *page, **hash; + + hash = page_hash(mapping, index); + page = __find_get_page(mapping, index, hash); + + if ( page ) { + if ( !TryLockPage(page) ) { + /* Page found and locked */ + /* This test is overly paranoid, but what the heck... */ + if ( unlikely(page->mapping != mapping || page->index != index) ) { + /* Someone reallocated this page under us. */ + UnlockPage(page); + page_cache_release(page); + return NULL; + } else { + return page; + } + } else { + /* Page locked by someone else */ + page_cache_release(page); + return NULL; + } + } + + page = page_cache_alloc(mapping); + if ( unlikely(!page) ) + return NULL; /* Failed to allocate a page */ + + if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) { + /* Someone else grabbed the page already. */ + page_cache_release(page); + return NULL; + } + + return page; +} + +#if 0 +#define PROFILE_READAHEAD +#define DEBUG_READAHEAD +#endif + +/* + * Read-ahead profiling information + * -------------------------------- + * Every PROFILE_MAXREADCOUNT, the following information is written + * to the syslog: + * Percentage of asynchronous read-ahead. + * Average of read-ahead fields context value. + * If DEBUG_READAHEAD is defined, a snapshot of these fields is written + * to the syslog. 
+ */ + +#ifdef PROFILE_READAHEAD + +#define PROFILE_MAXREADCOUNT 1000 + +static unsigned long total_reada; +static unsigned long total_async; +static unsigned long total_ramax; +static unsigned long total_ralen; +static unsigned long total_rawin; + +static void profile_readahead(int async, struct file *filp) +{ + unsigned long flags; + + ++total_reada; + if (async) + ++total_async; + + total_ramax += filp->f_ramax; + total_ralen += filp->f_ralen; + total_rawin += filp->f_rawin; + + if (total_reada > PROFILE_MAXREADCOUNT) { + save_flags(flags); + cli(); + if (!(total_reada > PROFILE_MAXREADCOUNT)) { + restore_flags(flags); + return; + } + + printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n", + total_ramax/total_reada, + total_ralen/total_reada, + total_rawin/total_reada, + (total_async*100)/total_reada); +#ifdef DEBUG_READAHEAD + printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n", + filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); +#endif + + total_reada = 0; + total_async = 0; + total_ramax = 0; + total_ralen = 0; + total_rawin = 0; + + restore_flags(flags); + } +} +#endif /* defined PROFILE_READAHEAD */ + +/* + * Read-ahead context: + * ------------------- + * The read ahead context fields of the "struct file" are the following: + * - f_raend : position of the first byte after the last page we tried to + * read ahead. + * - f_ramax : current read-ahead maximum size. + * - f_ralen : length of the current IO read block we tried to read-ahead. + * - f_rawin : length of the current read-ahead window. + * if last read-ahead was synchronous then + * f_rawin = f_ralen + * otherwise (was asynchronous) + * f_rawin = previous value of f_ralen + f_ralen + * + * Read-ahead limits: + * ------------------ + * MIN_READAHEAD : minimum read-ahead size when read-ahead. + * MAX_READAHEAD : maximum read-ahead size when read-ahead. + * + * Synchronous read-ahead benefits: + * -------------------------------- + * Using reasonable IO xfer length from peripheral devices increase system + * performances. + * Reasonable means, in this context, not too large but not too small. + * The actual maximum value is: + * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined + * and 32K if defined (4K page size assumed). + * + * Asynchronous read-ahead benefits: + * --------------------------------- + * Overlapping next read request and user process execution increase system + * performance. + * + * Read-ahead risks: + * ----------------- + * We have to guess which further data are needed by the user process. + * If these data are often not really needed, it's bad for system + * performances. + * However, we know that files are often accessed sequentially by + * application programs and it seems that it is possible to have some good + * strategy in that guessing. + * We only try to read-ahead files that seems to be read sequentially. + * + * Asynchronous read-ahead risks: + * ------------------------------ + * In order to maximize overlapping, we must start some asynchronous read + * request from the device, as soon as possible. + * We must be very careful about: + * - The number of effective pending IO read requests. + * ONE seems to be the only reasonable value. + * - The total memory pool usage for the file access stream. + * This maximum memory usage is implicitly 2 IO read chunks: + * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined, + * 64k if defined (4K page size assumed). 
+ */ + +static inline int get_max_readahead(struct inode * inode) +{ + if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)]) + return MAX_READAHEAD; + return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; +} + +static void generic_file_readahead(int reada_ok, + struct file * filp, struct inode * inode, + struct page * page) +{ + unsigned long end_index; + unsigned long index = page->index; + unsigned long max_ahead, ahead; + unsigned long raend; + int max_readahead = get_max_readahead(inode); + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + raend = filp->f_raend; + max_ahead = 0; + +/* + * The current page is locked. + * If the current position is inside the previous read IO request, do not + * try to reread previously read ahead pages. + * Otherwise decide or not to read ahead some pages synchronously. + * If we are not going to read ahead, set the read ahead context for this + * page only. + */ + if (PageLocked(page)) { + if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) { + raend = index; + if (raend < end_index) + max_ahead = filp->f_ramax; + filp->f_rawin = 0; + filp->f_ralen = 1; + if (!max_ahead) { + filp->f_raend = index + filp->f_ralen; + filp->f_rawin += filp->f_ralen; + } + } + } +/* + * The current page is not locked. + * If we were reading ahead and, + * if the current max read ahead size is not zero and, + * if the current position is inside the last read-ahead IO request, + * it is the moment to try to read ahead asynchronously. + * We will later force unplug device in order to force asynchronous read IO. + */ + else if (reada_ok && filp->f_ramax && raend >= 1 && + index <= raend && index + filp->f_ralen >= raend) { +/* + * Add ONE page to max_ahead in order to try to have about the same IO max size + * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. + * Compute the position of the last page we have tried to read in order to + * begin to read ahead just at the next page. + */ + raend -= 1; + if (raend < end_index) + max_ahead = filp->f_ramax + 1; + + if (max_ahead) { + filp->f_rawin = filp->f_ralen; + filp->f_ralen = 0; + reada_ok = 2; + } + } +/* + * Try to read ahead pages. + * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the + * scheduler, will work enough for us to avoid too bad actuals IO requests. + */ + ahead = 0; + while (ahead < max_ahead) { + ahead ++; + if ((raend + ahead) >= end_index) + break; + if (page_cache_read(filp, raend + ahead) < 0) + break; + } +/* + * If we tried to read ahead some pages, + * If we tried to read ahead asynchronously, + * Try to force unplug of the device in order to start an asynchronous + * read IO request. + * Update the read-ahead context. + * Store the length of the current read-ahead window. + * Double the current max read ahead size. + * That heuristic avoid to do some large IO for files that are not really + * accessed sequentially. + */ + if (ahead) { + filp->f_ralen += ahead; + filp->f_rawin += filp->f_ralen; + filp->f_raend = raend + ahead + 1; + + filp->f_ramax += filp->f_ramax; + + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + +#ifdef PROFILE_READAHEAD + profile_readahead((reada_ok == 2), filp); +#endif + } + + return; +} + +/* + * Mark a page as having seen activity. + * + * If it was already so marked, move it + * to the active queue and drop the referenced + * bit. Otherwise, just mark it for future + * action.. 
+ */ +void mark_page_accessed(struct page *page) +{ + if (!PageActive(page) && PageReferenced(page)) { + activate_page(page); + ClearPageReferenced(page); + return; + } + + /* Mark the page referenced, AFTER checking for previous usage.. */ + SetPageReferenced(page); +} + +/* + * This is a generic file read routine, and uses the + * inode->i_op->readpage() function for the actual low-level + * stuff. + * + * This is really ugly. But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + */ +void __do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor, int nonblock) +{ + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long index, offset; + struct page *cached_page; + int reada_ok; + int error; + int max_readahead = get_max_readahead(inode); + + cached_page = NULL; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + +/* + * If the current position is outside the previous read-ahead window, + * we reset the current read-ahead context and set read ahead max to zero + * (will be set to just needed value later), + * otherwise, we assume that the file accesses are sequential enough to + * continue read-ahead. + */ + if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { + reada_ok = 0; + filp->f_raend = 0; + filp->f_ralen = 0; + filp->f_ramax = 0; + filp->f_rawin = 0; + } else { + reada_ok = 1; + } +/* + * Adjust the current value of read-ahead max. + * If the read operation stay in the first half page, force no readahead. + * Otherwise try to increase read ahead max just enough to do the read request. + * Then, at least MIN_READAHEAD if read ahead is ok, + * and at most MAX_READAHEAD in all cases. + */ + if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { + filp->f_ramax = 0; + } else { + unsigned long needed; + + needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; + + if (filp->f_ramax < needed) + filp->f_ramax = needed; + + if (reada_ok && filp->f_ramax < MIN_READAHEAD) + filp->f_ramax = MIN_READAHEAD; + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + } + + for (;;) { + struct page *page, **hash; + unsigned long end_index, nr, ret; + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + if (index > end_index) + break; + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + nr = nr - offset; + + /* + * Try to find the data in the page cache.. + */ + hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (!page) + goto no_cached_page; +found_page: + page_cache_get(page); + spin_unlock(&pagecache_lock); + + conditional_schedule(); + + if (!Page_Uptodate(page)) { + if (nonblock) { + page_cache_release(page); + desc->error = -EWOULDBLOCKIO; + break; + } + goto page_not_up_to_date; + } + if (!nonblock) + generic_file_readahead(reada_ok, filp, inode, page); +page_ok: + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping->i_mmap_shared != NULL) + flush_dcache_page(page); + + /* + * Mark the page accessed if we read the + * beginning or we just did an lseek. 
+ */ + if (!offset || !filp->f_reada) + mark_page_accessed(page); + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + + conditional_schedule(); + + if (ret == nr && desc->count) + continue; + break; + +/* + * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. + */ +page_not_up_to_date: + generic_file_readahead(reada_ok, filp, inode, page); + + if (Page_Uptodate(page)) + goto page_ok; + + /* Get exclusive access to the page ... */ + lock_page(page); + + /* Did it get unhashed before we got the lock? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto page_ok; + } + +readpage: + /* ... and start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (!error) { + if (Page_Uptodate(page)) + goto page_ok; + + /* Again, try some read-ahead while waiting for the page to finish.. */ + generic_file_readahead(reada_ok, filp, inode, page); + wait_on_page(page); + if (Page_Uptodate(page)) + goto page_ok; + error = -EIO; + } + + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + break; + +no_cached_page: + if (nonblock) { + spin_unlock(&pagecache_lock); + desc->error = -EWOULDBLOCKIO; + break; + } + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + * + * We get here with the page cache lock held. + */ + if (!cached_page) { + spin_unlock(&pagecache_lock); + cached_page = page_cache_alloc(mapping); + if (!cached_page) { + desc->error = -ENOMEM; + break; + } + + /* + * Somebody may have added the page while we + * dropped the page cache lock. Check for that. + */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (page) + goto found_page; + } + + /* + * Ok, add the new page to the hash-queues... + */ + page = cached_page; + __add_to_page_cache(page, mapping, index, hash); + spin_unlock(&pagecache_lock); + lru_cache_add(page); + cached_page = NULL; + + goto readpage; + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + filp->f_reada = 1; + if (cached_page) + page_cache_release(cached_page); + UPDATE_ATIME(inode); +} + +static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset) +{ + ssize_t retval; + int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress; + struct kiobuf * iobuf; + struct inode * inode = filp->f_dentry->d_inode; + struct address_space * mapping = inode->i_mapping; + + new_iobuf = 0; + iobuf = filp->f_iobuf; + if (test_and_set_bit(0, &filp->f_iobuf_lock)) { + /* + * A parallel read/write is using the preallocated iobuf + * so just run slow and allocate a new one. 
+ */ + retval = alloc_kiovec(1, &iobuf); + if (retval) + goto out; + new_iobuf = 1; + } + + blocksize = 1 << inode->i_blkbits; + blocksize_bits = inode->i_blkbits; + blocksize_mask = blocksize - 1; + chunk_size = KIO_MAX_ATOMIC_IO << 10; + + retval = -EINVAL; + if ((offset & blocksize_mask) || (count & blocksize_mask)) + goto out_free; + if (!mapping->a_ops->direct_IO) + goto out_free; + + /* + * Flush to disk exlusively the _data_, metadata must remains + * completly asynchronous or performance will go to /dev/null. + */ + filemap_fdatasync(mapping); + retval = fsync_inode_data_buffers(inode); + filemap_fdatawait(mapping); + if (retval < 0) + goto out_free; + + progress = retval = 0; + while (count > 0) { + iosize = count; + if (iosize > chunk_size) + iosize = chunk_size; + + retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); + if (retval) + break; + + retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + + if (rw == READ && retval > 0) + mark_dirty_kiobuf(iobuf, retval); + + if (retval >= 0) { + count -= retval; + buf += retval; + progress += retval; + } + + unmap_kiobuf(iobuf); + + if (retval != iosize) + break; + } + + if (progress) + retval = progress; + + out_free: + if (!new_iobuf) + clear_bit(0, &filp->f_iobuf_lock); + else + free_kiovec(1, &iobuf); + out: + return retval; +} + +int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + kaddr = kmap(page); + left = __copy_to_user(desc->buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } + desc->count = count - size; + desc->written += size; + desc->buf += size; + return size; +} + +/* + * This is the "read()" routine for all filesystems + * that can use the page cache directly. 
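A filesystem that keeps its file data in the page cache plugs this routine, together with generic_file_write() and generic_file_mmap() defined further down in this file, straight into its file_operations. A sketch of the usual wiring; the "examplefs" name is hypothetical, the generic_* helpers are the stock ones:

/* Hypothetical filesystem using the generic page-cache entry points. */
static struct file_operations examplefs_file_operations = {
        llseek:         generic_file_llseek,
        read:           generic_file_read,
        write:          generic_file_write,
        mmap:           generic_file_mmap,
};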
+ */ +ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) +{ + ssize_t retval; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (filp->f_flags & O_DIRECT) + goto o_direct; + + retval = -EFAULT; + if (access_ok(VERIFY_WRITE, buf, count)) { + retval = 0; + + if (count) { + read_descriptor_t desc; + + desc.written = 0; + desc.count = count; + desc.buf = buf; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + } + } + out: + return retval; + + o_direct: + { + loff_t pos = *ppos, size; + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + retval = 0; + if (!count) + goto out; /* skip atime */ + size = inode->i_size; + if (pos < size) { + if (pos + count > size) + count = size - pos; + retval = generic_file_direct_IO(READ, filp, buf, count, pos); + if (retval > 0) + *ppos = pos + retval; + } + UPDATE_ATIME(filp->f_dentry->d_inode); + goto out; + } +} + +static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) +{ + ssize_t written; + unsigned long count = desc->count; + struct file *file = (struct file *) desc->buf; + + if (size > count) + size = count; + + if (file->f_op->sendpage) { + written = file->f_op->sendpage(file, page, offset, + size, &file->f_pos, size<count); + } else { + char *kaddr; + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + + kaddr = kmap(page); + written = file->f_op->write(file, kaddr + offset, size, &file->f_pos); + kunmap(page); + + set_fs(old_fs); + } + if (written < 0) { + desc->error = written; + written = 0; + } + desc->count = count - written; + desc->written += written; + return written; +} + +asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + ssize_t retval; + struct file * in_file, * out_file; + struct inode * in_inode, * out_inode; + + /* + * Get input file, and verify that it is ok.. + */ + retval = -EBADF; + in_file = fget(in_fd); + if (!in_file) + goto out; + if (!(in_file->f_mode & FMODE_READ)) + goto fput_in; + retval = -EINVAL; + in_inode = in_file->f_dentry->d_inode; + if (!in_inode) + goto fput_in; + if (!in_inode->i_mapping->a_ops->readpage) + goto fput_in; + retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count); + if (retval) + goto fput_in; + + /* + * Get output file, and verify that it is ok.. 
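For orientation before the output-side checks: from user space this entire function is reached through a single call. A small runnable illustration using glibc's sendfile(3) wrapper, with error handling kept to the essentials; note that when an explicit offset is passed, the kernel advances that offset and leaves the source descriptor's f_pos untouched, exactly as the code below arranges:

/* Copy one file to another with sendfile(2). */
#include <sys/sendfile.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int in_fd, out_fd;
        struct stat st;
        off_t offset = 0;
        ssize_t sent;

        if (argc != 3) {
                fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
                return 1;
        }
        in_fd = open(argv[1], O_RDONLY);
        out_fd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (in_fd < 0 || out_fd < 0 || fstat(in_fd, &st) < 0) {
                perror("open/fstat");
                return 1;
        }
        /* The kernel updates 'offset' as it copies; in_fd's f_pos is untouched. */
        sent = sendfile(out_fd, in_fd, &offset, st.st_size);
        if (sent < 0)
                perror("sendfile");
        else
                printf("copied %ld bytes\n", (long) sent);
        close(in_fd);
        close(out_fd);
        return sent < 0;
}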
+ */ + retval = -EBADF; + out_file = fget(out_fd); + if (!out_file) + goto fput_in; + if (!(out_file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + if (!out_file->f_op || !out_file->f_op->write) + goto fput_out; + out_inode = out_file->f_dentry->d_inode; + retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count); + if (retval) + goto fput_out; + + retval = 0; + if (count) { + read_descriptor_t desc; + loff_t pos = 0, *ppos; + + retval = -EFAULT; + ppos = &in_file->f_pos; + if (offset) { + if (get_user(pos, offset)) + goto fput_out; + ppos = &pos; + } + + desc.written = 0; + desc.count = count; + desc.buf = (char *) out_file; + desc.error = 0; + do_generic_file_read(in_file, ppos, &desc, file_send_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + if (offset) + put_user(pos, offset); + } + +fput_out: + fput(out_file); +fput_in: + fput(in_file); +out: + return retval; +} + +static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + unsigned long max; + + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + /* Limit it to the size of the file.. */ + max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT; + if (index > max) + return 0; + max -= index; + if (nr > max) + nr = max; + + /* And limit it to a sane percentage of the inactive list.. */ + max = nr_inactive_pages / 2; + if (nr > max) + nr = max; + + while (nr) { + page_cache_read(file, index); + index++; + nr--; + } + return 0; +} + +asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + unsigned long start = offset >> PAGE_CACHE_SHIFT; + unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT; + ret = do_readahead(file, start, len); + } + fput(file); + } + return ret; +} + +/* + * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are + * sure this is sequential access, we don't need a flexible read-ahead + * window size -- we can always use a large fixed size window. + */ +static void nopage_sequential_readahead(struct vm_area_struct * vma, + unsigned long pgoff, unsigned long filesize) +{ + unsigned long ra_window; + + ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode); + ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1); + + /* vm_raend is zero if we haven't read ahead in this area yet. */ + if (vma->vm_raend == 0) + vma->vm_raend = vma->vm_pgoff + ra_window; + + /* + * If we've just faulted the page half-way through our window, + * then schedule reads for the next window, and release the + * pages in the previous window. + */ + if ((pgoff + (ra_window >> 1)) == vma->vm_raend) { + unsigned long vm_raend = *(volatile unsigned long *) &vma->vm_raend; + unsigned long start = vma->vm_pgoff + vm_raend; + unsigned long end = start + ra_window; + + if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff)) + end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff; + /* + * Sanitize 'start' as well because vm_raend is racy when only + * the read sem is acquired like here. 
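Looping back to sys_readahead() above: the user-space side is a single hint call. The sketch below uses the readahead() wrapper that glibc later grew for this system call (at the time of this patch the syscall number would have had to be issued directly), so treat the wrapper name as an assumption:

/* Ask the kernel to start pulling the first megabyte of a file into the
 * page cache without waiting for the data. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (readahead(fd, 0, 1024 * 1024) < 0)
                perror("readahead");
        /* The pages now trickle in via page_cache_read(); later read()s of
         * the same range should be cache hits. */
        close(fd);
        return 0;
}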
+ */ + if (start < vma->vm_pgoff) + return; + if (start > end) + return; + + while ((start < end) && (start < filesize)) { + if (read_cluster_nonblocking(vma->vm_file, + start, filesize) < 0) + break; + start += CLUSTER_PAGES; + } + run_task_queue(&tq_disk); + + /* if we're far enough past the beginning of this area, + recycle pages that are in the previous window. */ + if (vm_raend > (vma->vm_pgoff + ra_window + ra_window)) { + unsigned long window = ra_window << PAGE_SHIFT; + + end = vma->vm_start + (vm_raend << PAGE_SHIFT); + end -= window + window; + filemap_sync(vma, end - window, window, MS_INVALIDATE); + } + + vma->vm_raend += ra_window; + } + + return; +} + +/* + * filemap_nopage() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) +{ + int error; + struct file *file = area->vm_file; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + struct page *page, **hash; + unsigned long size, pgoff, endoff; + + pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + +retry_all: + /* + * An external ptracer can access pages that normally aren't + * accessible.. + */ + size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if ((pgoff >= size) && (area->vm_mm == current->mm)) + return NULL; + + /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */ + if (size > endoff) + size = endoff; + + /* + * Do we have something in the page cache already? + */ + hash = page_hash(mapping, pgoff); +retry_find: + page = __find_get_page(mapping, pgoff, hash); + if (!page) + goto no_cached_page; + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!Page_Uptodate(page)) + goto page_not_uptodate; + +success: + /* + * Try read-ahead for sequential areas. + */ + if (VM_SequentialReadHint(area)) + nopage_sequential_readahead(area, pgoff, size); + + /* + * Found the page and have a reference on it, need to check sharing + * and possibly copy it over to another page.. + */ + activate_page(page); + return page; + +no_cached_page: + /* + * If the requested offset is within our file, try to read a whole + * cluster of pages at once. + * + * Otherwise, we're off the end of a privately mapped file, + * so we need to map a zero page. + */ + if ((pgoff < size) && !VM_RandomReadHint(area)) + error = read_cluster_nonblocking(file, pgoff, size); + else + error = page_cache_read(file, pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. + */ + if (error == -ENOMEM) + return NOPAGE_OOM; + return NULL; + +page_not_uptodate: + lock_page(page); + + /* Did it get unhashed while we waited for it? 
*/ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Did somebody else get it up-to-date? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Somebody else successfully read it in? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ + page_cache_release(page); + return NULL; +} + +/* Called with mm->page_table_lock held to protect against other + * threads/the swapper from ripping pte's out from under us. + */ +static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pte_t pte = *ptep; + + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) { + flush_tlb_page(vma, address); + set_page_dirty(page); + } + } + return 0; +} + +static inline int filemap_sync_pte_range(pmd_t * pmd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned long offset, unsigned int flags) +{ + pte_t * pte; + unsigned long end; + int error; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + pte = pte_offset(pmd, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + error = 0; + do { + error |= filemap_sync_pte(pte, vma, address + offset, flags); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + + if (conditional_schedule_needed()) { + spin_unlock(&vma->vm_mm->page_table_lock); + unconditional_schedule(); /* syncing large mapped files */ + spin_lock(&vma->vm_mm->page_table_lock); + } + return error; +} + +static inline int filemap_sync_pmd_range(pgd_t * pgd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned int flags) +{ + pmd_t * pmd; + unsigned long offset, end; + int error; + + if (pgd_none(*pgd)) + return 0; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return 0; + } + pmd = pmd_offset(pgd, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + error = 0; + do { + error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return error; +} + +int filemap_sync(struct vm_area_struct * vma, unsigned long address, + size_t size, unsigned int flags) +{ + pgd_t * dir; + unsigned long end = address + size; + int error = 0; + + /* Aquire the lock early; it may be possible to avoid dropping + * and reaquiring it repeatedly. 
+ */ + spin_lock(&vma->vm_mm->page_table_lock); + + dir = pgd_offset(vma->vm_mm, address); + flush_cache_range(vma->vm_mm, end - size, end); + if (address >= end) + BUG(); + do { + error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + flush_tlb_range(vma->vm_mm, end - size, end); + + spin_unlock(&vma->vm_mm->page_table_lock); + + return error; +} + +static struct vm_operations_struct generic_file_vm_ops = { + nopage: filemap_nopage, +}; + +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + if (!mapping->a_ops->writepage) + return -EINVAL; + } + if (!mapping->a_ops->readpage) + return -ENOEXEC; + UPDATE_ATIME(inode); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +/* + * The msync() system call. + */ + +static int msync_interval(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int flags) +{ + struct file * file = vma->vm_file; + if (file && (vma->vm_flags & VM_SHARED)) { + int error; + error = filemap_sync(vma, start, end-start, flags); + + if (!error && (flags & MS_SYNC)) { + struct inode * inode = file->f_dentry->d_inode; + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + if (file->f_op && file->f_op->fsync) + error = file->f_op->fsync(file, file->f_dentry, 1); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + } + return error; + } + return 0; +} + +asmlinkage long sys_msync(unsigned long start, size_t len, int flags) +{ + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error, error = -EINVAL; + + down_read(¤t->mm->mmap_sem); + if (start & ~PAGE_MASK) + goto out; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) + goto out; + error = 0; + if (end == start) + goto out; + /* + * If the interval [start,end) covers some unmapped address ranges, + * just ignore them, but return -EFAULT at the end. + */ + vma = find_vma(current->mm, start); + unmapped_error = 0; + for (;;) { + /* Still start < end. */ + error = -EFAULT; + if (!vma) + goto out; + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -EFAULT; + start = vma->vm_start; + } + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = msync_interval(vma, start, end, flags); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + /* Here vma->vm_start <= start < vma->vm_end < end. 
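For orientation, the user-space pattern this system call serves is the classic write-through-a-shared-mapping sequence. A small runnable sketch; "data.bin" is only a placeholder path and the file is assumed to already exist and be at least a few bytes long:

/* Dirty a shared file mapping, then force it out with msync(MS_SYNC). */
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        struct stat st;
        char *map;
        int fd = open("data.bin", O_RDWR);

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size < 5) {
                perror("open/fstat");
                return 1;
        }
        map = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memcpy(map, "hello", 5);                    /* dirty the first page */
        if (msync(map, st.st_size, MS_SYNC) < 0)    /* filemap_sync + fsync */
                perror("msync");
        munmap(map, st.st_size);
        close(fd);
        return 0;
}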
*/ + error = msync_interval(vma, start, vma->vm_end, flags); + if (error) + goto out; + start = vma->vm_end; + vma = vma->vm_next; + } +out: + up_read(¤t->mm->mmap_sem); + return error; +} + +static inline void setup_read_behavior(struct vm_area_struct * vma, + int behavior) +{ + VM_ClearReadHint(vma); + switch(behavior) { + case MADV_SEQUENTIAL: + vma->vm_flags |= VM_SEQ_READ; + break; + case MADV_RANDOM: + vma->vm_flags |= VM_RAND_READ; + break; + default: + break; + } + return; +} + +static long madvise_fixup_start(struct vm_area_struct * vma, + unsigned long end, int behavior) +{ + struct vm_area_struct * n; + struct mm_struct * mm = vma->vm_mm; + + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + n->vm_end = end; + setup_read_behavior(n, behavior); + n->vm_raend = 0; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_start = end; + __insert_vm_struct(mm, n); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +static long madvise_fixup_end(struct vm_area_struct * vma, + unsigned long start, int behavior) +{ + struct vm_area_struct * n; + struct mm_struct * mm = vma->vm_mm; + + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + n->vm_start = start; + n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; + setup_read_behavior(n, behavior); + n->vm_raend = 0; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_end = start; + __insert_vm_struct(mm, n); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +static long madvise_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int behavior) +{ + struct vm_area_struct * left, * right; + struct mm_struct * mm = vma->vm_mm; + + left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!left) + return -EAGAIN; + right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!right) { + kmem_cache_free(vm_area_cachep, left); + return -EAGAIN; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + right->vm_start = end; + right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; + left->vm_raend = 0; + right->vm_raend = 0; + if (vma->vm_file) + atomic_add(2, &vma->vm_file->f_count); + + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; + vma->vm_raend = 0; + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_start = start; + vma->vm_end = end; + setup_read_behavior(vma, behavior); + __insert_vm_struct(mm, left); + __insert_vm_struct(mm, right); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +/* + * We can potentially split a vm area into separate + * areas, each area with its own behavior. 
+ */ +static long madvise_behavior(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int behavior) +{ + int error = 0; + + /* This caps the number of vma's this process can own */ + if (vma->vm_mm->map_count > MAX_MAP_COUNT) + return -ENOMEM; + + if (start == vma->vm_start) { + if (end == vma->vm_end) { + setup_read_behavior(vma, behavior); + vma->vm_raend = 0; + } else + error = madvise_fixup_start(vma, end, behavior); + } else { + if (end == vma->vm_end) + error = madvise_fixup_end(vma, start, behavior); + else + error = madvise_fixup_middle(vma, start, end, behavior); + } + + return error; +} + +/* + * Schedule all required I/O operations, then run the disk queue + * to make sure they are started. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + long error = -EBADF; + struct file * file; + unsigned long size, rlim_rss; + + /* Doesn't work if there's no mapped file. */ + if (!vma->vm_file) + return error; + file = vma->vm_file; + size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + /* Make sure this doesn't exceed the process's max rss. */ + error = -EIO; + rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur : + LONG_MAX; /* default: see resource.h */ + if ((vma->vm_mm->rss + (end - start)) > rlim_rss) + return error; + + /* round to cluster boundaries if this isn't a "random" area. */ + if (!VM_RandomReadHint(vma)) { + start = CLUSTER_OFFSET(start); + end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1); + + while ((start < end) && (start < size)) { + error = read_cluster_nonblocking(file, start, size); + start += CLUSTER_PAGES; + if (error < 0) + break; + } + } else { + while ((start < end) && (start < size)) { + error = page_cache_read(file, start); + start++; + if (error < 0) + break; + } + } + + /* Don't wait for someone else to push these requests. */ + run_task_queue(&tq_disk); + + return error; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for refill_inactive to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * refill_inactive to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). 
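Putting the pieces together from user space: a program that streams through a file once typically brackets the scan with MADV_SEQUENTIAL and MADV_DONTNEED, which exercises madvise_behavior() above (and, on each fault, the sequential read-ahead in filemap_nopage()) as well as madvise_dontneed() below. A runnable sketch:

/* Scan a file sequentially, advising the kernel before and after. */
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;
        struct stat st;
        unsigned char *map;
        unsigned long sum = 0;
        off_t i;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0) {
                perror("open/fstat");
                return 1;
        }
        map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (map == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        madvise(map, st.st_size, MADV_SEQUENTIAL);  /* aggressive read-ahead */
        for (i = 0; i < st.st_size; i++)
                sum += map[i];
        madvise(map, st.st_size, MADV_DONTNEED);    /* drop the pages again  */
        printf("checksum %lu\n", sum);
        munmap(map, st.st_size);
        close(fd);
        return 0;
}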
+ */ +static long madvise_dontneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_flags & VM_LOCKED) + return -EINVAL; + + zap_page_range(vma->vm_mm, start, end - start, + ZPR_FLUSH_CACHE|ZPR_FLUSH_TLB|ZPR_COND_RESCHED); /* sys_madvise(MADV_DONTNEED) */ + + return 0; +} + +static long madvise_vma(struct vm_area_struct * vma, unsigned long start, + unsigned long end, int behavior) +{ + long error = -EBADF; + + switch (behavior) { + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + error = madvise_behavior(vma, start, end, behavior); + break; + + case MADV_WILLNEED: + error = madvise_willneed(vma, start, end); + break; + + case MADV_DONTNEED: + error = madvise_dontneed(vma, start, end); + break; + + default: + error = -EINVAL; + break; + } + + return error; +} + +/* + * The madvise(2) system call. + * + * Applications can use madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. The information + * provided is advisory only, and can be safely disregarded by the + * kernel without affecting the correct operation of the application. + * + * behavior values: + * MADV_NORMAL - the default behavior is to read clusters. This + * results in some read-ahead and read-behind. + * MADV_RANDOM - the system should read the minimum amount of data + * on any access, since it is unlikely that the appli- + * cation will need more than what it asks for. + * MADV_SEQUENTIAL - pages in the given range will probably be accessed + * once, so they can be aggressively read ahead, and + * can be freed soon after they are accessed. + * MADV_WILLNEED - the application is notifying the system to read + * some pages ahead. + * MADV_DONTNEED - the application is finished with the given range, + * so the kernel can free resources associated with it. + * + * return values: + * zero - success + * -EINVAL - start + len < 0, start is not page-aligned, + * "behavior" is not a valid value, or application + * is attempting to release locked or shared pages. + * -ENOMEM - addresses in the specified range are not currently + * mapped, or are outside the AS of the process. + * -EIO - an I/O error occurred while paging in data. + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. + */ +asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior) +{ + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error = 0; + int error = -EINVAL; + + down_write(¤t->mm->mmap_sem); + + if (start & ~PAGE_MASK) + goto out; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = madvise_vma(vma, start, end, + behavior); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + + /* Here vma->vm_start <= start < vma->vm_end < end. 
*/ + error = madvise_vma(vma, start, vma->vm_end, behavior); + if (error) + goto out; + start = vma->vm_end; + vma = vma->vm_next; + } + +out: + up_write(¤t->mm->mmap_sem); + return error; +} + +/* + * Later we can get more picky about what "in core" means precisely. + * For now, simply check to see if the page is in the page cache, + * and is up to date; i.e. that no page-in operation would be required + * at this time if an application were to map and access this page. + */ +static unsigned char mincore_page(struct vm_area_struct * vma, + unsigned long pgoff) +{ + unsigned char present = 0; + struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; + struct page * page, ** hash = page_hash(as, pgoff); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(as, pgoff, *hash); + if ((page) && (Page_Uptodate(page))) + present = 1; + spin_unlock(&pagecache_lock); + + return present; +} + +static long mincore_vma(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned char * vec) +{ + long error, i, remaining; + unsigned char * tmp; + + error = -ENOMEM; + if (!vma->vm_file) + return error; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + error = -EAGAIN; + tmp = (unsigned char *) __get_free_page(GFP_KERNEL); + if (!tmp) + return error; + + /* (end - start) is # of pages, and also # of bytes in "vec */ + remaining = (end - start), + + error = 0; + for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { + int j = 0; + long thispiece = (remaining < PAGE_SIZE) ? + remaining : PAGE_SIZE; + + while (j < thispiece) + tmp[j++] = mincore_page(vma, start++); + + if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { + error = -EFAULT; + break; + } + } + + free_page((unsigned long) tmp); + return error; +} + +/* + * The mincore(2) system call. + * + * mincore() returns the memory residency status of the pages in the + * current process's address space specified by [addr, addr + len). + * The status is returned in a vector of bytes. The least significant + * bit of each byte is 1 if the referenced page is in memory, otherwise + * it is zero. + * + * Because the status of a page can change after mincore() checks it + * but before it returns to the application, the returned vector may + * contain stale information. Only locked pages are guaranteed to + * remain in memory. + * + * return values: + * zero - success + * -EFAULT - vec points to an illegal address + * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE, + * or len has a nonpositive value + * -ENOMEM - Addresses in the range [addr, addr + len] are + * invalid for the address space of this process, or + * specify one or more pages which are not currently + * mapped + * -EAGAIN - A kernel resource was temporarily unavailable. + */ +asmlinkage long sys_mincore(unsigned long start, size_t len, + unsigned char * vec) +{ + int index = 0; + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error = 0; + long error = -EINVAL; + + down_read(¤t->mm->mmap_sem); + + if (start & ~PAGE_CACHE_MASK) + goto out; + len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK; + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + for (;;) { + /* Still start < end. 
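For completeness, the user-space counterpart of the loop being set up here: map a file, hand mincore() one status byte per page, and count how many have bit 0 set. A runnable sketch, with the vector sized in pages as the documentation comment above describes:

/* Report how much of a mapped file is currently resident in memory. */
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        long page = sysconf(_SC_PAGESIZE);
        int fd;
        struct stat st;
        void *map;
        unsigned char *vec;
        size_t pages, i, resident = 0;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0) {
                perror("open/fstat");
                return 1;
        }
        pages = (st.st_size + page - 1) / page;
        map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        vec = malloc(pages);
        if (map == MAP_FAILED || !vec) {
                perror("mmap/malloc");
                return 1;
        }
        if (mincore(map, st.st_size, vec) < 0) {
                perror("mincore");
                return 1;
        }
        for (i = 0; i < pages; i++)
                if (vec[i] & 1)
                        resident++;
        printf("%lu of %lu pages resident\n",
               (unsigned long) resident, (unsigned long) pages);
        free(vec);
        munmap(map, st.st_size);
        close(fd);
        return 0;
}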
*/ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = mincore_vma(vma, start, end, + &vec[index]); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = mincore_vma(vma, start, vma->vm_end, &vec[index]); + if (error) + goto out; + index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; + start = vma->vm_end; + vma = vma->vm_next; + } + +out: + up_read(¤t->mm->mmap_sem); + return error; +} + +static inline +struct page *__read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page **hash = page_hash(mapping, index); + struct page *page, *cached_page = NULL; + int err; +repeat: + page = __find_get_page(mapping, index, hash); + if (!page) { + if (!cached_page) { + cached_page = page_cache_alloc(mapping); + if (!cached_page) + return ERR_PTR(-ENOMEM); + } + page = cached_page; + if (add_to_page_cache_unique(page, mapping, index, hash)) + goto repeat; + cached_page = NULL; + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + } + if (cached_page) + page_cache_release(cached_page); + return page; +} + +/* + * Read into the page cache. If a page already exists, + * and Page_Uptodate() is not set, try to fill the page. + */ +struct page *read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + int err; + +retry: + page = __read_cache_page(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + mark_page_accessed(page); + if (Page_Uptodate(page)) + goto out; + + lock_page(page); + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry; + } + if (Page_Uptodate(page)) { + UnlockPage(page); + goto out; + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + out: + return page; +} + +static inline struct page * __grab_cache_page(struct address_space *mapping, + unsigned long index, struct page **cached_page) +{ + struct page *page, **hash = page_hash(mapping, index); +repeat: + page = __find_lock_page(mapping, index, hash); + if (!page) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (!*cached_page) + return NULL; + } + page = *cached_page; + if (add_to_page_cache_unique(page, mapping, index, hash)) + goto repeat; + *cached_page = NULL; + } + return page; +} + +inline void remove_suid(struct inode *inode) +{ + unsigned int mode; + + /* set S_IGID if S_IXGRP is set, and always set S_ISUID */ + mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID; + + /* was any of the uid bits set? */ + mode &= inode->i_mode; + if (mode && !capable(CAP_FSETID)) { + inode->i_mode &= ~mode; + mark_inode_dirty(inode); + } +} + +/* + * Write to a file through the page cache. + * + * We currently put everything into the page cache prior to writing it. + * This is not a problem when writing full pages. With partial pages, + * however, we first have to read the data into the cache, then + * dirty the page, and finally schedule it for writing. 
Alternatively, we + * could write-through just the portion of data that would go into that + * page, but that would kill performance for applications that write data + * line by line, and it's prone to race conditions. + * + * Note that this routine doesn't try to keep track of dirty pages. Each + * file system has to do this all by itself, unfortunately. + * okir@monad.swb.de + */ +ssize_t +generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + loff_t pos; + struct page *page, *cached_page; + unsigned long written; + long status = 0; + int err; + unsigned bytes; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + cached_page = NULL; + + down(&inode->i_sem); + + pos = *ppos; + err = -EINVAL; + if (pos < 0) + goto out; + + err = file->f_error; + if (err) { + file->f_error = 0; + goto out; + } + + written = 0; + + /* FIXME: this is for backwards compatibility with 2.4 */ + if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) + pos = inode->i_size; + + /* + * Check whether we've reached the file size limit. + */ + err = -EFBIG; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + count = limit - (u32)pos; + } + } + + /* + * LFS rule + */ + if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { + if (pos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (count > MAX_NON_LFS - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + count = MAX_NON_LFS - (u32)pos; + } + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write + * If we have exceeded without writing data we send + * a signal and give them an EFBIG. + * + * Linus frestrict idea will clean these up nicely.. + */ + + if (!S_ISBLK(inode->i_mode)) { + if (pos >= inode->i_sb->s_maxbytes) + { + if (count || pos > inode->i_sb->s_maxbytes) { + send_sig(SIGXFSZ, current, 0); + err = -EFBIG; + goto out; + } + /* zero-length writes at ->s_maxbytes are OK */ + } + + if (pos + count > inode->i_sb->s_maxbytes) + count = inode->i_sb->s_maxbytes - pos; + } else { + if (is_read_only(inode->i_rdev)) { + err = -EPERM; + goto out; + } + if (pos >= inode->i_size) { + if (count || pos > inode->i_size) { + err = -ENOSPC; + goto out; + } + } + + if (pos + count > inode->i_size) + count = inode->i_size - pos; + } + + err = 0; + if (count == 0) + goto out; + + remove_suid(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + + if (file->f_flags & O_DIRECT) + goto o_direct; + + do { + unsigned long index, offset; + long page_fault; + char *kaddr; + + /* + * Try to find the page in the cache. If it isn't there, + * allocate a free page. + */ + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. 
+ */ + { volatile unsigned char dummy; + __get_user(dummy, buf); + __get_user(dummy, buf+bytes-1); + } + + status = -ENOMEM; /* we'll assign it later anyway */ + page = __grab_cache_page(mapping, index, &cached_page); + if (!page) + break; + + /* We have exclusive IO access to the page.. */ + if (!PageLocked(page)) { + PAGE_BUG(page); + } + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes); + if (status) + goto unlock; + page_fault = __copy_from_user(kaddr+offset, buf, bytes); + flush_dcache_page(page); + + conditional_schedule(); + + status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); + if (page_fault) + goto fail_write; + if (!status) + status = bytes; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + } +unlock: + kunmap(page); + + /* + * Mark the page accessed if we wrote the + * beginning or we just did an lseek. + */ + if (!offset || !file->f_reada) + SetPageReferenced(page); + + /* Mark it unlocked again and drop the page.. */ + UnlockPage(page); + page_cache_release(page); + + conditional_schedule(); + + if (status < 0) + break; + } while (count); + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* For now, when the user asks for O_SYNC, we'll actually + * provide O_DSYNC. */ + if ((status >= 0) && (file->f_flags & O_SYNC)) + status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA); + +out_status: + err = written ? written : status; +out: + + up(&inode->i_sem); + return err; +fail_write: + status = -EFAULT; + goto unlock; + +o_direct: + written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos); + if (written > 0) { + loff_t end = pos + written; + if (end > inode->i_size && !S_ISBLK(inode->i_mode)) { + inode->i_size = end; + mark_inode_dirty(inode); + } + *ppos = end; + invalidate_inode_pages2(mapping); + } + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. 
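Both the buffered loop above and this O_DIRECT tail depend on the filesystem's address_space_operations: prepare_write()/commit_write() for the buffered case, direct_IO() for this one. On a conventional block-based filesystem those methods are usually thin wrappers around the stock fs/buffer.c helpers. A sketch only; the "examplefs" names are hypothetical, with examplefs_get_block standing in for the filesystem's real block-mapping routine:

/* Hypothetical block-based filesystem delegating to the generic helpers. */
static int examplefs_get_block(struct inode *inode, long block,
                               struct buffer_head *bh, int create);

static int examplefs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, examplefs_get_block);
}

static int examplefs_writepage(struct page *page)
{
        return block_write_full_page(page, examplefs_get_block);
}

static int examplefs_prepare_write(struct file *file, struct page *page,
                                   unsigned from, unsigned to)
{
        return block_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
        readpage:      examplefs_readpage,
        writepage:     examplefs_writepage,
        sync_page:     block_sync_page,
        prepare_write: examplefs_prepare_write,
        commit_write:  generic_commit_write,
};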
+ */ + if (written >= 0 && file->f_flags & O_SYNC) + status = generic_osync_inode(inode, OSYNC_METADATA); + goto out_status; +} + +void __init page_cache_init(unsigned long mempages) +{ + unsigned long htable_size, order; + + htable_size = mempages; + htable_size *= sizeof(struct page *); + for(order = 0; (PAGE_SIZE << order) < htable_size; order++) + ; + + do { + unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *); + + page_hash_bits = 0; + while((tmp >>= 1UL) != 0UL) + page_hash_bits++; + + page_hash_table = (struct page **) + __get_free_pages(GFP_ATOMIC, order); + } while(page_hash_table == NULL && --order > 0); + + printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n", + (1 << page_hash_bits), order, (PAGE_SIZE << order)); + if (!page_hash_table) + panic("Failed to allocate page hash table\n"); + memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); +} diff --git a/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/sched.h b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/sched.h new file mode 100644 index 000000000000..c2e891695008 --- /dev/null +++ b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/sched.h @@ -0,0 +1,944 @@ +#ifndef _LINUX_SCHED_H +#define _LINUX_SCHED_H + +#include <asm/param.h> /* for HZ */ + +extern unsigned long event; + +#include <linux/config.h> +#include <linux/binfmts.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/times.h> +#include <linux/timex.h> +#include <linux/rbtree.h> +#include <linux/condsched.h> + +#include <asm/system.h> +#include <asm/semaphore.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/mmu.h> + +#include <linux/smp.h> +#include <linux/tty.h> +#include <linux/sem.h> +#include <linux/signal.h> +#include <linux/securebits.h> +#include <linux/fs_struct.h> +#include <linux/low-latency.h> +#include <linux/numa_sched.h> + +struct exec_domain; + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_PID 0x00001000 /* set if pid shared */ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? */ + +#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) + +/* + * These are the constant used to fake the fixed-point load-average + * counting. Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. 
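To make the fixed-point format concrete, the following stand-alone program feeds a constant three runnable tasks into CALC_LOAD once per five-second interval and prints the running 1-minute average the way /proc/loadavg renders it (the LOAD_INT/LOAD_FRAC decoding mirrors the one used by the proc code). Pure arithmetic, runnable anywhere:

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)
#define EXP_1    1884                     /* 1/exp(5sec/1min) as fixed-point */

#define CALC_LOAD(load, exp, n) \
        load *= exp; \
        load += n * (FIXED_1 - exp); \
        load >>= FSHIFT;

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avenrun_1 = 0;              /* 1-minute average         */
        unsigned long nr_running = 3 * FIXED_1;   /* pretend 3 runnable tasks */
        int tick;

        /* One "tick" per LOAD_FREQ (5 s); the average creeps towards 3.00. */
        for (tick = 0; tick < 24; tick++) {
                CALC_LOAD(avenrun_1, EXP_1, nr_running);
                printf("after %3d s: %lu.%02lu\n", (tick + 1) * 5,
                       LOAD_INT(avenrun_1), LOAD_FRAC(avenrun_1));
        }
        return 0;
}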
+ */ +extern unsigned long avenrun[]; /* Load averages */ + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ +#define LOAD_FREQ (5*HZ) /* 5 sec intervals */ +#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ +#define EXP_5 2014 /* 1/exp(5sec/5min) */ +#define EXP_15 2037 /* 1/exp(5sec/15min) */ + +#define CALC_LOAD(load,exp,n) \ + load *= exp; \ + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + +#define CT_TO_SECS(x) ((x) / HZ) +#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) + +extern int nr_running, nr_threads; +extern int last_pid; + +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/param.h> +#include <linux/resource.h> +#include <linux/timer.h> + +#include <asm/processor.h> + +#define TASK_RUNNING 0 +#define TASK_INTERRUPTIBLE 1 +#define TASK_UNINTERRUPTIBLE 2 +#define TASK_ZOMBIE 4 +#define TASK_STOPPED 8 + +#define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +#ifdef CONFIG_SMP +#define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) +#else +#define set_task_state(tsk, state_value) \ + __set_task_state((tsk), (state_value)) +#endif + +#define __set_current_state(state_value) \ + do { current->state = (state_value); } while (0) +#ifdef CONFIG_SMP +#define set_current_state(state_value) \ + set_mb(current->state, (state_value)) +#else +#define set_current_state(state_value) \ + __set_current_state(state_value) +#endif + +/* + * Scheduling policies + */ +#define SCHED_OTHER 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 + +/* + * This is an additional bit set when we want to + * yield the CPU for one re-schedule.. + */ +#define SCHED_YIELD 0x10 + +struct sched_param { + int sched_priority; +}; + +struct completion; + +#ifdef __KERNEL__ + +#include <linux/spinlock.h> + +/* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). + */ +extern rwlock_t tasklist_lock; +extern spinlock_t runqueue_lock; +extern spinlock_t mmlist_lock; + +extern void sched_init(void); +extern void init_idle(void); +extern void show_state(void); +extern void cpu_init (void); +extern void trap_init(void); +extern void update_process_times(int user); +extern void update_one_process(struct task_struct *p, unsigned long user, + unsigned long system, int cpu); + +#define MAX_SCHEDULE_TIMEOUT LONG_MAX +extern signed long FASTCALL(schedule_timeout(signed long timeout)); +asmlinkage void schedule(void); + +extern int schedule_task(struct tq_struct *task); +extern void flush_scheduled_tasks(void); +extern int start_context_thread(void); +extern int current_is_keventd(void); +extern void force_cpu_reschedule(int cpu); + +/* + * The default fd array needs to be at least BITS_PER_LONG, + * as this is the granularity returned by copy_fdset(). + */ +#define NR_OPEN_DEFAULT BITS_PER_LONG + +/* + * Open file table structure + */ +struct files_struct { + atomic_t count; + rwlock_t file_lock; /* Protects all the below members. 
Nests inside tsk->alloc_lock */ + int max_fds; + int max_fdset; + int next_fd; + struct file ** fd; /* current fd array */ + fd_set *close_on_exec; + fd_set *open_fds; + fd_set close_on_exec_init; + fd_set open_fds_init; + struct file * fd_array[NR_OPEN_DEFAULT]; +}; + +#define INIT_FILES \ +{ \ + count: ATOMIC_INIT(1), \ + file_lock: RW_LOCK_UNLOCKED, \ + max_fds: NR_OPEN_DEFAULT, \ + max_fdset: __FD_SETSIZE, \ + next_fd: 0, \ + fd: &init_files.fd_array[0], \ + close_on_exec: &init_files.close_on_exec_init, \ + open_fds: &init_files.open_fds_init, \ + close_on_exec_init: { { 0, } }, \ + open_fds_init: { { 0, } }, \ + fd_array: { NULL, } \ +} + +/* Maximum number of active map areas.. This is a random (large) number */ +#define MAX_MAP_COUNT (65536) + +struct mm_struct { + struct vm_area_struct * mmap; /* list of VMAs */ + rb_root_t mm_rb; + struct vm_area_struct * mmap_cache; /* last find_vma result */ + pgd_t * pgd; + atomic_t mm_users; /* How many users with user space? */ + atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + int map_count; /* number of VMAs */ + struct rw_semaphore mmap_sem; + spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ + + struct list_head mmlist; /* List of all active mm's. These are globally strung + * together off init_mm.mmlist, and are protected + * by mmlist_lock + */ + + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long rss, total_vm, locked_vm; + unsigned long def_flags; + unsigned long cpu_vm_mask; + unsigned long swap_address; + + unsigned dumpable:1; + + /* Architecture-specific MM context */ + mm_context_t context; +}; + +extern int mmlist_nr; + +#define INIT_MM(name) \ +{ \ + mm_rb: RB_ROOT, \ + pgd: swapper_pg_dir, \ + mm_users: ATOMIC_INIT(2), \ + mm_count: ATOMIC_INIT(1), \ + mmap_sem: RWSEM_INITIALIZER(name.mmap_sem), \ + page_table_lock: SPIN_LOCK_UNLOCKED, \ + mmlist: LIST_HEAD_INIT(name.mmlist), \ +} + +struct signal_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; +}; + + +#define INIT_SIGNALS { \ + count: ATOMIC_INIT(1), \ + action: { {{0,}}, }, \ + siglock: SPIN_LOCK_UNLOCKED \ +} + +/* + * Some day this will be a full-fledged user tracking system.. + */ +struct user_struct { + atomic_t __count; /* reference count */ + atomic_t processes; /* How many processes does this user have? */ + atomic_t files; /* How many open files does this user have? */ + + /* Hash table maintenance information */ + struct user_struct *next, **pprev; + uid_t uid; +}; + +#define get_current_user() ({ \ + struct user_struct *__user = current->user; \ + atomic_inc(&__user->__count); \ + __user; }) + +extern struct user_struct root_user; +#define INIT_USER (&root_user) + +struct zone_struct; + +struct local_pages { + struct list_head list; + unsigned int order, nr; + struct zone_struct * classzone; +}; + +struct task_struct { + /* + * offsets of these are hardcoded elsewhere - touch with care + */ + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + unsigned long flags; /* per process flags, defined below */ + int sigpending; + mm_segment_t addr_limit; /* thread address space: + 0-0xBFFFFFFF for user-thead + 0-0xFFFFFFFF for kernel-thread + */ + struct exec_domain *exec_domain; + volatile long need_resched; + unsigned long ptrace; + + int lock_depth; /* Lock depth */ + +/* + * offset 32 begins here on 32-bit platforms. 
We keep + * all fields in a single cacheline that are needed for + * the goodness() loop in schedule(). + */ + volatile int counter; + int nice; + unsigned int policy; + struct mm_struct *mm; + int has_cpu, processor; + unsigned long cpus_allowed; + /* + * (only the 'next' pointer fits into the cacheline, but + * that's just fine.) + */ + struct list_head run_list; +#ifdef CONFIG_NUMA_SCHED + int nid; +#endif + int get_child_timeslice; + struct task_struct *next_task, *prev_task; + struct mm_struct *active_mm; + struct rw_sem_recursor mm_recursor; + struct local_pages local_pages; + +/* task state */ + struct linux_binfmt *binfmt; + int exit_code, exit_signal; + int pdeath_signal; /* The signal sent when the parent dies */ + /* ??? */ + unsigned long personality; + int did_exec:1; + pid_t pid; + pid_t pgrp; + pid_t tty_old_pgrp; + pid_t session; + pid_t tgid; + /* boolean value for session group leader */ + int leader; + /* + * pointers to (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->p_pptr->pid) + */ + struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; + struct list_head thread_group; + + /* PID hash table linkage. */ + struct task_struct *pidhash_next; + struct task_struct **pidhash_pprev; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + struct completion *vfork_done; /* for vfork() */ + unsigned long rt_priority; + unsigned long it_real_value, it_prof_value, it_virt_value; + unsigned long it_real_incr, it_prof_incr, it_virt_incr; + struct timer_list real_timer; + struct tms times; + unsigned long start_time; + long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ + unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; + int swappable:1; +/* process credentials */ + uid_t uid,euid,suid,fsuid; + gid_t gid,egid,sgid,fsgid; + int ngroups; + gid_t groups[NGROUPS]; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + int keep_capabilities:1; + struct user_struct *user; +/* limits */ + struct rlimit rlim[RLIM_NLIMITS]; + unsigned short used_math; + char comm[16]; +/* file system info */ + int link_count, total_link_count; + struct tty_struct *tty; /* NULL if no tty */ + unsigned int locks; /* How many file locks are being held */ +/* ipc stuff */ + struct sem_undo *semundo; + struct sem_queue *semsleeping; +/* CPU-specific state of this task */ + struct thread_struct thread; +/* filesystem information */ + struct fs_struct *fs; +/* open file information */ + struct files_struct *files; +/* signal handlers */ + spinlock_t sigmask_lock; /* Protects signal and blocked */ + struct signal_struct *sig; + + sigset_t blocked; + struct sigpending pending; + + unsigned long sas_ss_sp; + size_t sas_ss_size; + int (*notifier)(void *priv); + void *notifier_data; + sigset_t *notifier_mask; + + /* TUX state */ + void *tux_info; + void (*tux_exit)(void); + +/* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +/* Protection of (de-)allocation: mm, files, fs, tty */ + spinlock_t alloc_lock; +}; + +/* + * Per process flags + */ +#define PF_EXITING (1UL<<0) /* getting shut down */ +#define PF_FORKNOEXEC (1UL<<1) /* forked but didn't exec */ +#define PF_SUPERPRIV (1UL<<2) /* used super-user privileges */ +#define PF_DUMPCORE (1UL<<3) /* dumped core */ +#define PF_SIGNALED (1UL<<4) /* killed by a signal */ +#define PF_MEMALLOC (1UL<<5) /* Allocating memory */ +#define 
PF_USEDFPU (1UL<<6) /* task used FPU this quantum (SMP) */ +#define PF_ATOMICALLOC (1UL<<7) /* do not block during memalloc */ +#define PF_FREE_PAGES (1UL<<8) /* per process page freeing */ + + +/* + * Ptrace flags + */ + +#define PT_PTRACED 0x00000001 +#define PT_TRACESYS 0x00000002 +#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */ +#define PT_TRACESYSGOOD 0x00000008 +#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */ + +/* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. + */ +#define _STK_LIM (8*1024*1024) + +#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ +#define MAX_COUNTER (20*HZ/100) +#define DEF_NICE (0) + + +/* + * The default (Linux) execution domain. + */ +extern struct exec_domain default_exec_domain; + +/* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. Base=0, limit=0x1fffff (=2MB) + */ +#define INIT_TASK(tsk) \ +{ \ + state: 0, \ + flags: 0, \ + sigpending: 0, \ + addr_limit: KERNEL_DS, \ + exec_domain: &default_exec_domain, \ + lock_depth: -1, \ + counter: DEF_COUNTER, \ + nice: DEF_NICE, \ + policy: SCHED_OTHER, \ + mm: NULL, \ + active_mm: &init_mm, \ + mm_recursor: RWSEM_RECURSOR_INITIALIZER, \ + cpus_allowed: -1UL, \ + run_list: LIST_HEAD_INIT(tsk.run_list), \ + next_task: &tsk, \ + prev_task: &tsk, \ + p_opptr: &tsk, \ + p_pptr: &tsk, \ + thread_group: LIST_HEAD_INIT(tsk.thread_group), \ + wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ + real_timer: { \ + function: it_real_fn \ + }, \ + cap_effective: CAP_INIT_EFF_SET, \ + cap_inheritable: CAP_INIT_INH_SET, \ + cap_permitted: CAP_FULL_SET, \ + keep_capabilities: 0, \ + rlim: INIT_RLIMITS, \ + user: INIT_USER, \ + comm: "swapper", \ + thread: INIT_THREAD, \ + fs: &init_fs, \ + files: &init_files, \ + sigmask_lock: SPIN_LOCK_UNLOCKED, \ + sig: &init_signals, \ + pending: { NULL, &tsk.pending.head, {{0}}}, \ + blocked: {{0}}, \ + alloc_lock: SPIN_LOCK_UNLOCKED \ +} + + +#ifndef INIT_TASK_SIZE +# define INIT_TASK_SIZE 2048*sizeof(long) +#endif + +union task_union { + struct task_struct task; + unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; +}; + +extern union task_union init_task_union; + +extern struct mm_struct init_mm; +extern struct task_struct *init_tasks[NR_CPUS]; + +/* PID hashing. (shouldnt this be dynamic?) */ +#define PIDHASH_SZ (4096 >> 2) +extern struct task_struct *pidhash[PIDHASH_SZ]; + +#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) + +static inline void hash_pid(struct task_struct *p) +{ + struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; + + if((p->pidhash_next = *htable) != NULL) + (*htable)->pidhash_pprev = &p->pidhash_next; + *htable = p; + p->pidhash_pprev = htable; +} + +static inline void unhash_pid(struct task_struct *p) +{ + if(p->pidhash_next) + p->pidhash_next->pidhash_pprev = p->pidhash_pprev; + *p->pidhash_pprev = p->pidhash_next; +} + +static inline struct task_struct *find_task_by_pid(int pid) +{ + struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; + + for(p = *htable; p && p->pid != pid; p = p->pidhash_next) + ; + + return p; +} + +/* per-UID process charging. 
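One usage note on the PID hash above: find_task_by_pid() hands back a bare task_struct pointer without taking any reference, so callers hold tasklist_lock for reading across both the lookup and the use of the result (kill_proc_info() and friends follow this pattern). A sketch, where do_something_to() is only a placeholder:

/* Illustrative lookup pattern; not part of the kernel. */
static int operate_on_pid(pid_t pid)
{
        struct task_struct *p;
        int err = -ESRCH;

        read_lock(&tasklist_lock);         /* pins the task list and the hash  */
        p = find_task_by_pid(pid);
        if (p)
                err = do_something_to(p);  /* must not sleep with the lock held */
        read_unlock(&tasklist_lock);
        return err;
}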
*/ +extern struct user_struct * alloc_uid(uid_t); +extern void free_uid(struct user_struct *); + +#include <asm/current.h> + +extern unsigned long volatile jiffies; +extern unsigned long itimer_ticks; +extern unsigned long itimer_next; +extern volatile struct timeval xtime; +extern void do_timer(struct pt_regs *); + +extern unsigned int * prof_buffer; +extern unsigned long prof_len; +extern unsigned long prof_shift; + +#define CURRENT_TIME (xtime.tv_sec) + +extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); +extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); +extern void FASTCALL(sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); +extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); +extern int FASTCALL(wake_up_process(struct task_struct * tsk)); + +#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) +#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) +#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) +#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) +#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) +#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) +#define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_sync_nr(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) +asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +extern void proc_caches_init(void); +extern void flush_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *); +extern int dequeue_signal(sigset_t *, siginfo_t *); +extern void block_all_signals(int (*notifier)(void *priv), void *priv, + sigset_t *mask); +extern void unblock_all_signals(void); +extern int send_sig_info(int, struct siginfo *, struct task_struct *); +extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int kill_pg_info(int, struct siginfo *, pid_t); +extern int kill_sl_info(int, struct siginfo *, pid_t); +extern int kill_proc_info(int, struct siginfo *, pid_t); +extern void notify_parent(struct task_struct *, int); +extern void do_notify_parent(struct task_struct *, int); +extern void force_sig(int, struct task_struct *); +extern int send_sig(int, struct task_struct *, int); +extern int kill_pg(pid_t, int, int); +extern int kill_sl(pid_t, int, int); +extern int kill_proc(pid_t, int, int); +extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); +extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); + +static inline int signal_pending(struct task_struct *p) +{ + return (p->sigpending != 0); +} + +/* + * Re-calculate pending state from the set of locally pending + * signals, globally pending signals, and blocked signals. 
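The helpers below, has_pending_signals() and recalc_sigpending(), are what keep tsk->sigpending coherent when the blocked set changes; callers are expected to edit current->blocked only while holding sigmask_lock and then recalculate, roughly as the sigprocmask() path does. A sketch of that convention:

/* Illustrative only: block one signal for the current task. */
static void block_one_signal(int sig)
{
        spin_lock_irq(&current->sigmask_lock);
        sigaddset(&current->blocked, sig);
        recalc_sigpending(current);        /* keep ->sigpending in step */
        spin_unlock_irq(&current->sigmask_lock);
}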
+ */ +static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +{ + unsigned long ready; + long i; + + switch (_NSIG_WORDS) { + default: + for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) + ready |= signal->sig[i] &~ blocked->sig[i]; + break; + + case 4: ready = signal->sig[3] &~ blocked->sig[3]; + ready |= signal->sig[2] &~ blocked->sig[2]; + ready |= signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 2: ready = signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 1: ready = signal->sig[0] &~ blocked->sig[0]; + } + return ready != 0; +} + +/* Reevaluate whether the task has signals pending delivery. + This is required every time the blocked sigset_t changes. + All callers should have t->sigmask_lock. */ + +static inline void recalc_sigpending(struct task_struct *t) +{ + t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); +} + +/* True if we are on the alternate signal stack. */ + +static inline int on_sig_stack(unsigned long sp) +{ + return (sp - current->sas_ss_sp < current->sas_ss_size); +} + +static inline int sas_ss_flags(unsigned long sp) +{ + return (current->sas_ss_size == 0 ? SS_DISABLE + : on_sig_stack(sp) ? SS_ONSTACK : 0); +} + +extern int request_irq(unsigned int, + void (*handler)(int, void *, struct pt_regs *), + unsigned long, const char *, void *); +extern void free_irq(unsigned int, void *); + +/* + * This has now become a routine instead of a macro, it sets a flag if + * it returns true (to do BSD-style accounting where the process is flagged + * if it uses root privs). The implication of this is that you should do + * normal permissions checks first, and check suser() last. + * + * [Dec 1997 -- Chris Evans] + * For correctness, the above considerations need to be extended to + * fsuser(). This is done, along with moving fsuser() checks to be + * last. + * + * These will be removed, but in the mean time, when the SECURE_NOROOT + * flag is set, uids don't grant privilege. + */ +static inline int suser(void) +{ + if (!issecure(SECURE_NOROOT) && current->euid == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +static inline int fsuser(void) +{ + if (!issecure(SECURE_NOROOT) && current->fsuid == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +/* + * capable() checks for a particular capability. + * New privilege checks should use this interface, rather than suser() or + * fsuser(). See include/linux/capability.h for defined capabilities. + */ + +static inline int capable(int cap) +{ +#if 1 /* ok now */ + if (cap_raised(current->cap_effective, cap)) +#else + if (cap_is_fs_cap(cap) ? 
current->fsuid == 0 : current->euid == 0) +#endif + { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +/* + * Routines for handling mm_structs + */ +extern struct mm_struct * mm_alloc(void); + +extern struct mm_struct * start_lazy_tlb(void); +extern void end_lazy_tlb(struct mm_struct *mm); + +/* mmdrop drops the mm and the page tables */ +extern inline void FASTCALL(__mmdrop(struct mm_struct *)); +static inline void mmdrop(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop(mm); +} + +/* mmput gets rid of the mappings and all user-space */ +extern void mmput(struct mm_struct *); +/* Remove the current tasks stale references to the old mm_struct */ +extern void mm_release(void); + +/* + * Routines for handling the fd arrays + */ +extern struct file ** alloc_fd_array(int); +extern int expand_fd_array(struct files_struct *, int nr); +extern void free_fd_array(struct file **, int); + +extern fd_set *alloc_fdset(int); +extern int expand_fdset(struct files_struct *, int nr); +extern void free_fdset(fd_set *, int); + +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern void flush_thread(void); +extern void exit_thread(void); + +extern void exit_mm(struct task_struct *); +extern void exit_files(struct task_struct *); +extern void exit_sighand(struct task_struct *); + +extern void reparent_to_init(void); +extern void daemonize(void); + +extern int do_execve(char *, char **, char **, struct pt_regs *); +extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); + +extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); + +#define nr_running_inc() \ +do { \ + numa_nr_running_inc(); \ + nr_running++; \ +} while (0) + +#define nr_running_dec() \ +do { \ + numa_nr_running_dec(); \ + nr_running--; \ +} while (0) + +#define nr_threads_inc() \ +do { \ + numa_nr_threads_inc(); \ + nr_threads++; \ +} while (0) + +#define nr_threads_dec() \ +do { \ + numa_nr_threads_dec(); \ + nr_threads--; \ +} while (0) + +#define __wait_event(wq, condition) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + schedule(); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __wait_event(wq, condition); \ +} while (0) + +#define __wait_event_interruptible(wq, condition, ret) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (!signal_pending(current)) { \ + schedule(); \ + continue; \ + } \ + ret = -ERESTARTSYS; \ + break; \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __wait_event_interruptible(wq, condition, __ret); \ + __ret; \ +}) + +#define REMOVE_LINKS(p) do { \ + (p)->next_task->prev_task = (p)->prev_task; \ + (p)->prev_task->next_task = (p)->next_task; \ + if ((p)->p_osptr) \ + (p)->p_osptr->p_ysptr = 
(p)->p_ysptr; \
+ if ((p)->p_ysptr) \
+ (p)->p_ysptr->p_osptr = (p)->p_osptr; \
+ else \
+ (p)->p_pptr->p_cptr = (p)->p_osptr; \
+ } while (0)
+
+#define SET_LINKS(p) do { \
+ (p)->next_task = &init_task; \
+ (p)->prev_task = init_task.prev_task; \
+ init_task.prev_task->next_task = (p); \
+ init_task.prev_task = (p); \
+ (p)->p_ysptr = NULL; \
+ if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
+ (p)->p_osptr->p_ysptr = p; \
+ (p)->p_pptr->p_cptr = p; \
+ } while (0)
+
+#define for_each_task(p) \
+ for (p = &init_task ; (p = p->next_task) != &init_task ; )
+
+#define next_thread(p) \
+ list_entry((p)->thread_group.next, struct task_struct, thread_group)
+
+#define del_from_runqueue(p) \
+do { \
+ nr_running_dec(); \
+ list_del(&(p)->run_list); \
+ (p)->run_list.next = NULL; \
+} while(0)
+
+static inline int task_on_runqueue(struct task_struct *p)
+{
+ return (p->run_list.next != NULL);
+}
+
+#define unhash_process(p) \
+do { \
+ if (task_on_runqueue(p)) BUG(); \
+ write_lock_irq(&tasklist_lock); \
+ nr_threads_dec(); \
+ unhash_pid(p); \
+ REMOVE_LINKS(p); \
+ list_del(&(p)->thread_group); \
+ write_unlock_irq(&tasklist_lock); \
+} while(0)
+
+/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
+static inline void task_lock(struct task_struct *p)
+{
+ spin_lock(&p->alloc_lock);
+}
+
+static inline void task_unlock(struct task_struct *p)
+{
+ spin_unlock(&p->alloc_lock);
+}
+
+/* write full pathname into buffer and return start of pathname */
+static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
+ char *buf, int buflen)
+{
+ char *res;
+ struct vfsmount *rootmnt;
+ struct dentry *root;
+ read_lock(&current->fs->lock);
+ rootmnt = mntget(current->fs->rootmnt);
+ root = dget(current->fs->root);
+ read_unlock(&current->fs->lock);
+ spin_lock(&dcache_lock);
+ res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen);
+ spin_unlock(&dcache_lock);
+ dput(root);
+ mntput(rootmnt);
+ return res;
+}
+
+#endif /* __KERNEL__ */
+
+#endif
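The comment block above suser() and fsuser() recommends that new privilege checks go through capable(), and that the capability test come after the ordinary permission checks so PF_SUPERPRIV is only raised when root privilege is actually exercised. A minimal sketch of that pattern follows; the helper name example_may_set_time and the choice of CAP_SYS_TIME are illustrative assumptions for this example, not anything defined by the header above.

/* Hypothetical caller: allow a clock-setting operation only with CAP_SYS_TIME.
 * Any ordinary checks on the request itself would run before this point;
 * the capability test comes last, as the comment above recommends. */
static inline int example_may_set_time(void)
{
	return capable(CAP_SYS_TIME);
}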
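For the d_path() helper defined just before the final #endif, the calling convention is that the name is assembled from the tail of the caller-supplied buffer, and the returned pointer (not the buffer start) is what should be used afterwards. The sketch below is a hedged usage illustration assuming a 2.4-era caller with the usual allocator and printk declarations in scope; example_print_pwd, the page-sized scratch buffer, and the KERN_DEBUG message are illustrative choices only.

/* Hypothetical helper: log the current task's working directory. */
static inline void example_print_pwd(void)
{
	char *page = (char *) __get_free_page(GFP_KERNEL);
	struct dentry *pwd;
	struct vfsmount *pwdmnt;
	char *path;

	if (!page)
		return;

	/* Pin pwd/pwdmnt under fs->lock, mirroring what d_path() does for root. */
	read_lock(&current->fs->lock);
	pwd = dget(current->fs->pwd);
	pwdmnt = mntget(current->fs->pwdmnt);
	read_unlock(&current->fs->lock);

	path = d_path(pwd, pwdmnt, page, PAGE_SIZE);
	printk(KERN_DEBUG "%s: pwd=%s\n", current->comm, path);

	dput(pwd);
	mntput(pwdmnt);
	free_page((unsigned long) page);
}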