author    Daniel Robbins <drobbins@gentoo.org>  2001-11-16 22:12:25 +0000
committer Daniel Robbins <drobbins@gentoo.org>  2001-11-16 22:12:25 +0000
commit    3a7632d575ac3a60c6ca9541d0b48e09c881c38e (patch)
tree      1ee37744646d132a6ddab94faafaa66579a2584c /sys-kernel
parent    ifixo (diff)
download  gentoo-2-3a7632d575ac3a60c6ca9541d0b48e09c881c38e.tar.gz
          gentoo-2-3a7632d575ac3a60c6ca9541d0b48e09c881c38e.tar.bz2
          gentoo-2-3a7632d575ac3a60c6ca9541d0b48e09c881c38e.zip
missing stuffs
Diffstat (limited to 'sys-kernel')
-rw-r--r--  sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/array.c     698
-rw-r--r--  sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/filemap.c  3143
-rw-r--r--  sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/sched.h     944
3 files changed, 4785 insertions, 0 deletions
diff --git a/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/array.c b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/array.c
new file mode 100644
index 000000000000..188ce6b49953
--- /dev/null
+++ b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/array.c
@@ -0,0 +1,698 @@
+/*
+ * linux/fs/proc/array.c
+ *
+ * Copyright (C) 1992 by Linus Torvalds
+ * based on ideas by Darren Senn
+ *
+ * Fixes:
+ * Michael. K. Johnson: stat,statm extensions.
+ * <johnsonm@stolaf.edu>
+ *
+ * Pauline Middelink : Made cmdline,envline only break at '\0's, to
+ * make sure SET_PROCTITLE works. Also removed
+ * bad '!' which forced address recalculation for
+ * EVERY character on the current page.
+ * <middelin@polyware.iaf.nl>
+ *
+ * Danny ter Haar : added cpuinfo
+ * <dth@cistron.nl>
+ *
+ * Alessandro Rubini : profile extension.
+ * <rubini@ipvvis.unipv.it>
+ *
+ * Jeff Tranter : added BogoMips field to cpuinfo
+ * <Jeff_Tranter@Mitel.COM>
+ *
+ * Bruno Haible : remove 4K limit for the maps file
+ * <haible@ma2s2.mathematik.uni-karlsruhe.de>
+ *
+ * Yves Arrouye : remove removal of trailing spaces in get_array.
+ * <Yves.Arrouye@marin.fdn.fr>
+ *
+ * Jerome Forissier : added per-CPU time information to /proc/stat
+ * and /proc/<pid>/cpu extension
+ * <forissier@isia.cma.fr>
+ * - Incorporation and non-SMP safe operation
+ * of forissier patch in 2.1.78 by
+ * Hans Marcus <crowbar@concepts.nl>
+ *
+ * aeb@cwi.nl : /proc/partitions
+ *
+ *
+ * Alan Cox : security fixes.
+ * <Alan.Cox@linux.org>
+ *
+ * Al Viro : safe handling of mm_struct
+ *
+ * Gerhard Wichert : added BIGMEM support
+ * Siemens AG <Gerhard.Wichert@pdb.siemens.de>
+ *
+ * Al Viro & Jeff Garzik : moved most of the thing into base.c and
+ * : proc_misc.c. The rest may eventually go into
+ * : base.c too.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/tty.h>
+#include <linux/string.h>
+#include <linux/mman.h>
+#include <linux/proc_fs.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/signal.h>
+#include <linux/highmem.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+/* Gcc optimizes away "strlen(x)" for constant x */
+#define ADDBUF(buffer, string) \
+do { memcpy(buffer, string, strlen(string)); \
+ buffer += strlen(string); } while (0)
+
+static inline char * task_name(struct task_struct *p, char * buf)
+{
+ int i;
+ char * name;
+
+ ADDBUF(buf, "Name:\t");
+ name = p->comm;
+ i = sizeof(p->comm);
+ do {
+ unsigned char c = *name;
+ name++;
+ i--;
+ *buf = c;
+ if (!c)
+ break;
+ if (c == '\\') {
+ buf[1] = c;
+ buf += 2;
+ continue;
+ }
+ if (c == '\n') {
+ buf[0] = '\\';
+ buf[1] = 'n';
+ buf += 2;
+ continue;
+ }
+ buf++;
+ } while (i);
+ *buf = '\n';
+ return buf+1;
+}
+
+/*
+ * The task state array is a strange "bitmap" of
+ * reasons to sleep. Thus "running" is zero, and
+ * you can test for combinations of others with
+ * simple bit tests.
+ */
+static const char *task_state_array[] = {
+ "R (running)", /* 0 */
+ "S (sleeping)", /* 1 */
+ "D (disk sleep)", /* 2 */
+ "Z (zombie)", /* 4 */
+ "T (stopped)", /* 8 */
+ "W (paging)" /* 16 */
+};
+
+static inline const char * get_task_state(struct task_struct *tsk)
+{
+ unsigned int state = tsk->state & (TASK_RUNNING |
+ TASK_INTERRUPTIBLE |
+ TASK_UNINTERRUPTIBLE |
+ TASK_ZOMBIE |
+ TASK_STOPPED);
+ const char **p = &task_state_array[0];
+
+ while (state) {
+ p++;
+ state >>= 1;
+ }
+ return *p;
+}
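
A minimal userspace sketch of the lookup above, assuming the same array ordering and the same state bit layout as the masks in get_task_state(): the array index is simply how many right-shifts it takes to clear the state word, so 0 maps to "R (running)" and each successively higher bit selects the next entry.

#include <stdio.h>

/* Same ordering as task_state_array: entry n+1 corresponds to state bit n. */
static const char *state_names[] = {
	"R (running)",    /* state == 0  */
	"S (sleeping)",   /* state == 1  */
	"D (disk sleep)", /* state == 2  */
	"Z (zombie)",     /* state == 4  */
	"T (stopped)",    /* state == 8  */
	"W (paging)"      /* state == 16 */
};

static const char *state_to_string(unsigned int state)
{
	const char **p = &state_names[0];

	/* Each shift that still leaves a nonzero value advances one entry. */
	while (state) {
		p++;
		state >>= 1;
	}
	return *p;
}

int main(void)
{
	unsigned int samples[] = { 0, 1, 2, 4, 8, 16 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("state %2u -> %s\n", samples[i], state_to_string(samples[i]));
	return 0;
}
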
+
+static inline char * task_state(struct task_struct *p, char *buffer)
+{
+ int g;
+
+ read_lock(&tasklist_lock);
+ buffer += sprintf(buffer,
+ "State:\t%s\n"
+ "Tgid:\t%d\n"
+ "Pid:\t%d\n"
+ "PPid:\t%d\n"
+ "TracerPid:\t%d\n"
+ "Uid:\t%d\t%d\t%d\t%d\n"
+ "Gid:\t%d\t%d\t%d\t%d\n",
+ get_task_state(p), p->tgid,
+ p->pid, p->pid ? p->p_opptr->pid : 0, 0,
+ p->uid, p->euid, p->suid, p->fsuid,
+ p->gid, p->egid, p->sgid, p->fsgid);
+ read_unlock(&tasklist_lock);
+ task_lock(p);
+ buffer += sprintf(buffer,
+ "FDSize:\t%d\n"
+ "Groups:\t",
+ p->files ? p->files->max_fds : 0);
+ task_unlock(p);
+
+ for (g = 0; g < p->ngroups; g++)
+ buffer += sprintf(buffer, "%d ", p->groups[g]);
+
+ buffer += sprintf(buffer, "\n");
+ return buffer;
+}
+
+static inline char * task_mem(struct mm_struct *mm, char *buffer)
+{
+ struct vm_area_struct * vma;
+ unsigned long data = 0, stack = 0;
+ unsigned long exec = 0, lib = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long len = (vma->vm_end - vma->vm_start) >> 10;
+ if (!vma->vm_file) {
+ data += len;
+ if (vma->vm_flags & VM_GROWSDOWN)
+ stack += len;
+ continue;
+ }
+ if (vma->vm_flags & VM_WRITE)
+ continue;
+ if (vma->vm_flags & VM_EXEC) {
+ exec += len;
+ if (vma->vm_flags & VM_EXECUTABLE)
+ continue;
+ lib += len;
+ }
+ }
+ buffer += sprintf(buffer,
+ "VmSize:\t%8lu kB\n"
+ "VmLck:\t%8lu kB\n"
+ "VmRSS:\t%8lu kB\n"
+ "VmData:\t%8lu kB\n"
+ "VmStk:\t%8lu kB\n"
+ "VmExe:\t%8lu kB\n"
+ "VmLib:\t%8lu kB\n",
+ mm->total_vm << (PAGE_SHIFT-10),
+ mm->locked_vm << (PAGE_SHIFT-10),
+ mm->rss << (PAGE_SHIFT-10),
+ data - stack, stack,
+ exec - lib, lib);
+ up_read(&mm->mmap_sem);
+ return buffer;
+}
+
+static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
+ sigset_t *catch)
+{
+ struct k_sigaction *k;
+ int i;
+
+ sigemptyset(ign);
+ sigemptyset(catch);
+
+ if (p->sig) {
+ k = p->sig->action;
+ for (i = 1; i <= _NSIG; ++i, ++k) {
+ if (k->sa.sa_handler == SIG_IGN)
+ sigaddset(ign, i);
+ else if (k->sa.sa_handler != SIG_DFL)
+ sigaddset(catch, i);
+ }
+ }
+}
+
+static inline char * task_sig(struct task_struct *p, char *buffer)
+{
+ sigset_t ign, catch;
+
+ buffer += sprintf(buffer, "SigPnd:\t");
+ buffer = render_sigset_t(&p->pending.signal, buffer);
+ *buffer++ = '\n';
+ buffer += sprintf(buffer, "SigBlk:\t");
+ buffer = render_sigset_t(&p->blocked, buffer);
+ *buffer++ = '\n';
+
+ collect_sigign_sigcatch(p, &ign, &catch);
+ buffer += sprintf(buffer, "SigIgn:\t");
+ buffer = render_sigset_t(&ign, buffer);
+ *buffer++ = '\n';
+ buffer += sprintf(buffer, "SigCgt:\t"); /* Linux 2.0 uses "SigCgt" */
+ buffer = render_sigset_t(&catch, buffer);
+ *buffer++ = '\n';
+
+ return buffer;
+}
+
+static inline char *task_cap(struct task_struct *p, char *buffer)
+{
+ return buffer + sprintf(buffer, "CapInh:\t%016x\n"
+ "CapPrm:\t%016x\n"
+ "CapEff:\t%016x\n",
+ cap_t(p->cap_inheritable),
+ cap_t(p->cap_permitted),
+ cap_t(p->cap_effective));
+}
+
+
+int proc_pid_status(struct task_struct *task, char * buffer)
+{
+ char * orig = buffer;
+ struct mm_struct *mm;
+
+ buffer = task_name(task, buffer);
+ buffer = task_state(task, buffer);
+ task_lock(task);
+ mm = task->mm;
+ if(mm)
+ atomic_inc(&mm->mm_users);
+ task_unlock(task);
+ if (mm) {
+ buffer = task_mem(mm, buffer);
+ mmput(mm);
+ }
+ buffer = task_sig(task, buffer);
+ buffer = task_cap(task, buffer);
+#if defined(CONFIG_ARCH_S390)
+ buffer = task_show_regs(task, buffer);
+#endif
+ return buffer - orig;
+}
+
+int proc_pid_stat(struct task_struct *task, char * buffer)
+{
+ unsigned long vsize, eip, esp, wchan;
+ long priority, nice;
+ int tty_pgrp = -1, tty_nr = 0;
+ sigset_t sigign, sigcatch;
+ char state;
+ int res;
+ pid_t ppid;
+ struct mm_struct *mm;
+
+ state = *get_task_state(task);
+ vsize = eip = esp = 0;
+ task_lock(task);
+ mm = task->mm;
+ if(mm)
+ atomic_inc(&mm->mm_users);
+ if (task->tty) {
+ tty_pgrp = task->tty->pgrp;
+ tty_nr = kdev_t_to_nr(task->tty->device);
+ }
+ task_unlock(task);
+ if (mm) {
+ struct vm_area_struct *vma;
+ down_read(&mm->mmap_sem);
+ vma = mm->mmap;
+ while (vma) {
+ vsize += vma->vm_end - vma->vm_start;
+ vma = vma->vm_next;
+ }
+ eip = KSTK_EIP(task);
+ esp = KSTK_ESP(task);
+ up_read(&mm->mmap_sem);
+ }
+
+ wchan = get_wchan(task);
+
+ collect_sigign_sigcatch(task, &sigign, &sigcatch);
+
+ /* scale priority and nice values from timeslices to -20..20 */
+ /* to make it look like a "normal" Unix priority/nice value */
+ priority = task->counter;
+ priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
+ nice = task->nice;
+
+ read_lock(&tasklist_lock);
+ ppid = task->pid ? task->p_opptr->pid : 0;
+ read_unlock(&tasklist_lock);
+ res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d\n",
+ task->pid,
+ task->comm,
+ state,
+ ppid,
+ task->pgrp,
+ task->session,
+ tty_nr,
+ tty_pgrp,
+ task->flags,
+ task->min_flt,
+ task->cmin_flt,
+ task->maj_flt,
+ task->cmaj_flt,
+ task->times.tms_utime,
+ task->times.tms_stime,
+ task->times.tms_cutime,
+ task->times.tms_cstime,
+ priority,
+ nice,
+ 0UL /* removed */,
+ task->it_real_value,
+ task->start_time,
+ vsize,
+ mm ? mm->rss : 0, /* you might want to shift this left 3 */
+ task->rlim[RLIMIT_RSS].rlim_cur,
+ mm ? mm->start_code : 0,
+ mm ? mm->end_code : 0,
+ mm ? mm->start_stack : 0,
+ esp,
+ eip,
+ /* The signal information here is obsolete.
+ * It must be decimal for Linux 2.0 compatibility.
+ * Use /proc/#/status for real-time signals.
+ */
+ task->pending.signal.sig[0] & 0x7fffffffUL,
+ task->blocked.sig[0] & 0x7fffffffUL,
+ sigign .sig[0] & 0x7fffffffUL,
+ sigcatch .sig[0] & 0x7fffffffUL,
+ wchan,
+ task->nswap,
+ task->cnswap,
+ task->exit_signal,
+ task->processor);
+ if(mm)
+ mmput(mm);
+ return res;
+}
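
The priority/nice scaling above maps the remaining timeslice (task->counter) onto the traditional -20..20 range. A small standalone check of the formula, assuming DEF_COUNTER is the default timeslice in ticks (10*HZ/100, i.e. 10 at HZ=100):

#include <stdio.h>

#define HZ 100
#define DEF_COUNTER (10 * HZ / 100)	/* assumed default timeslice in ticks */

/* Mirror of the scaling used above: a larger remaining timeslice yields a
 * lower (better) reported priority, rounded to the nearest step. */
static long scale_priority(long counter)
{
	return 20 - (counter * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
}

int main(void)
{
	for (long counter = 0; counter <= 4 * DEF_COUNTER; counter += DEF_COUNTER)
		printf("counter=%2ld -> priority=%3ld\n", counter, scale_priority(counter));
	return 0;
}

With that assumption, counter values of 0, 10, 20, 30 and 40 report priorities of 20, 10, 0, -10 and -20 respectively.
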
+
+static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
+ int * pages, int * shared, int * dirty, int * total)
+{
+ pte_t * pte;
+ unsigned long end;
+
+ if (pmd_none(*pmd))
+ return;
+ if (pmd_bad(*pmd)) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ return;
+ }
+ pte = pte_offset(pmd, address);
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+ do {
+ pte_t page;
+ struct page *ptpage;
+
+ conditional_schedule();
+ page=*pte;
+ address += PAGE_SIZE;
+ pte++;
+ if (pte_none(page))
+ continue;
+ ++*total;
+ if (!pte_present(page))
+ continue;
+ ptpage = pte_page(page);
+ if ((!VALID_PAGE(ptpage)) || PageReserved(ptpage))
+ continue;
+ ++*pages;
+ if (pte_dirty(page))
+ ++*dirty;
+ if (page_count(pte_page(page)) > 1)
+ ++*shared;
+ } while (address < end);
+}
+
+static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size,
+ int * pages, int * shared, int * dirty, int * total)
+{
+ pmd_t * pmd;
+ unsigned long end;
+
+ if (pgd_none(*pgd))
+ return;
+ if (pgd_bad(*pgd)) {
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+ return;
+ }
+ pmd = pmd_offset(pgd, address);
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ do {
+ statm_pte_range(pmd, address, end - address, pages, shared, dirty, total);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+}
+
+static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end,
+ int * pages, int * shared, int * dirty, int * total)
+{
+ while (address < end) {
+ statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total);
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ pgd++;
+ }
+}
+
+int proc_pid_statm(struct task_struct *task, char * buffer)
+{
+ struct mm_struct *mm;
+ int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;
+
+ task_lock(task);
+ mm = task->mm;
+ if(mm)
+ atomic_inc(&mm->mm_users);
+ task_unlock(task);
+ if (mm) {
+ struct vm_area_struct * vma;
+ down_read(&mm->mmap_sem);
+ vma = mm->mmap;
+ while (vma) {
+ pgd_t *pgd = pgd_offset(mm, vma->vm_start);
+ int pages = 0, shared = 0, dirty = 0, total = 0;
+
+ statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
+ resident += pages;
+ share += shared;
+ dt += dirty;
+ size += total;
+ if (vma->vm_flags & VM_EXECUTABLE)
+ trs += pages; /* text */
+ else if (vma->vm_flags & VM_GROWSDOWN)
+ drs += pages; /* stack */
+ else if (vma->vm_end > 0x60000000)
+ lrs += pages; /* library */
+ else
+ drs += pages;
+ vma = vma->vm_next;
+ }
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+ return sprintf(buffer,"%d %d %d %d %d %d %d\n",
+ size, resident, share, trs, lrs, drs, dt);
+}
+
+/*
+ * The way we support synthetic files > 4K
+ * - without storing their contents in some buffer and
+ * - without walking through the entire synthetic file until we reach the
+ * position of the requested data
+ * is to cleverly encode the current position in the file's f_pos field.
+ * There is no requirement that a read() call which returns `count' bytes
+ * of data increases f_pos by exactly `count'.
+ *
+ * This idea is Linus' one. Bruno implemented it.
+ */
+
+/*
+ * For the /proc/<pid>/maps file, we use fixed length records, each containing
+ * a single line.
+ *
+ * f_pos = (number of the vma in the task->mm->mmap list) * PAGE_SIZE
+ * + (index into the line)
+ */
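
A minimal sketch of that f_pos encoding, assuming 4 KB pages (PAGE_SHIFT = 12): the vma's line number lives in the upper bits and the byte offset within that line in the lower bits, which is what proc_pid_read_maps() below reconstructs (there by repeated subtraction of PAGE_SIZE).

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)	/* assumed 4 KB pages */

/* Pack (vma line number, offset within that line) into an f_pos value. */
static unsigned long maps_pos_encode(unsigned long lineno, unsigned long loff)
{
	return (lineno << PAGE_SHIFT) + loff;
}

/* Recover the pair from an f_pos value. */
static void maps_pos_decode(unsigned long pos,
			    unsigned long *lineno, unsigned long *loff)
{
	*lineno = pos >> PAGE_SHIFT;
	*loff = pos & (PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long pos = maps_pos_encode(3, 17);	/* 4th vma, byte 17 of its line */
	unsigned long lineno, loff;

	maps_pos_decode(pos, &lineno, &loff);
	printf("pos=%lu -> line %lu, offset %lu\n", pos, lineno, loff);
	return 0;
}
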
+/* for systems with sizeof(void*) == 4: */
+#define MAPS_LINE_FORMAT4 "%08lx-%08lx %s %08lx %s %lu"
+#define MAPS_LINE_MAX4 49 /* sum of 8 1 8 1 4 1 8 1 5 1 10 1 */
+
+/* for systems with sizeof(void*) == 8: */
+#define MAPS_LINE_FORMAT8 "%016lx-%016lx %s %016lx %s %lu"
+#define MAPS_LINE_MAX8 73 /* sum of 16 1 16 1 4 1 16 1 5 1 10 1 */
+
+#define MAPS_LINE_FORMAT (sizeof(void*) == 4 ? MAPS_LINE_FORMAT4 : MAPS_LINE_FORMAT8)
+#define MAPS_LINE_MAX (sizeof(void*) == 4 ? MAPS_LINE_MAX4 : MAPS_LINE_MAX8)
+
+static int proc_pid_maps_get_line (char *buf, struct vm_area_struct *map)
+{
+ /* produce the next line */
+ char *line;
+ char str[5];
+ int flags;
+ kdev_t dev;
+ unsigned long ino;
+ int len;
+
+ flags = map->vm_flags;
+
+ str[0] = flags & VM_READ ? 'r' : '-';
+ str[1] = flags & VM_WRITE ? 'w' : '-';
+ str[2] = flags & VM_EXEC ? 'x' : '-';
+ str[3] = flags & VM_MAYSHARE ? 's' : 'p';
+ str[4] = 0;
+
+ dev = 0;
+ ino = 0;
+ if (map->vm_file != NULL) {
+ dev = map->vm_file->f_dentry->d_inode->i_dev;
+ ino = map->vm_file->f_dentry->d_inode->i_ino;
+ line = d_path(map->vm_file->f_dentry,
+ map->vm_file->f_vfsmnt,
+ buf, PAGE_SIZE);
+ buf[PAGE_SIZE-1] = '\n';
+ line -= MAPS_LINE_MAX;
+ if(line < buf)
+ line = buf;
+ } else
+ line = buf;
+
+ len = sprintf(line,
+ MAPS_LINE_FORMAT,
+ map->vm_start, map->vm_end, str, map->vm_pgoff << PAGE_SHIFT,
+ kdevname(dev), ino);
+
+ if(map->vm_file) {
+ int i;
+ for(i = len; i < MAPS_LINE_MAX; i++)
+ line[i] = ' ';
+ len = buf + PAGE_SIZE - line;
+ memmove(buf, line, len);
+ } else
+ line[len++] = '\n';
+ return len;
+}
+
+ssize_t proc_pid_read_maps (struct task_struct *task, struct file * file, char * buf,
+ size_t count, loff_t *ppos)
+{
+ struct mm_struct *mm;
+ struct vm_area_struct * map;
+ char *tmp, *kbuf;
+ long retval;
+ int off, lineno, loff;
+
+ /* reject calls with out of range parameters immediately */
+ retval = 0;
+ if (*ppos > LONG_MAX)
+ goto out;
+ if (count == 0)
+ goto out;
+ off = (long)*ppos;
+ /*
+ * We might sleep getting the page, so get it first.
+ */
+ retval = -ENOMEM;
+ kbuf = (char*)__get_free_page(GFP_KERNEL);
+ if (!kbuf)
+ goto out;
+
+ tmp = (char*)__get_free_page(GFP_KERNEL);
+ if (!tmp)
+ goto out_free1;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm)
+ atomic_inc(&mm->mm_users);
+ task_unlock(task);
+ retval = 0;
+ if (!mm)
+ goto out_free2;
+
+ down_read(&mm->mmap_sem);
+ map = mm->mmap;
+ lineno = 0;
+ loff = 0;
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+ while (map) {
+ int len;
+ if (off > PAGE_SIZE) {
+ off -= PAGE_SIZE;
+ goto next;
+ }
+ len = proc_pid_maps_get_line(tmp, map);
+ len -= off;
+ if (len > 0) {
+ if (retval+len > count) {
+ /* only partial line transfer possible */
+ len = count - retval;
+ /* save the offset where the next read
+ * must start */
+ loff = len+off;
+ }
+ memcpy(kbuf+retval, tmp+off, len);
+ retval += len;
+ }
+ off = 0;
+next:
+ if (!loff)
+ lineno++;
+ if (retval >= count)
+ break;
+ if (loff) BUG();
+ map = map->vm_next;
+ }
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+
+ if (retval > count) BUG();
+ if (copy_to_user(buf, kbuf, retval))
+ retval = -EFAULT;
+ else
+ *ppos = (lineno << PAGE_SHIFT) + loff;
+
+out_free2:
+ free_page((unsigned long)tmp);
+out_free1:
+ free_page((unsigned long)kbuf);
+out:
+ return retval;
+}
+
+#ifdef CONFIG_SMP
+int proc_pid_cpu(struct task_struct *task, char * buffer)
+{
+ int i, len;
+
+ len = sprintf(buffer,
+ "cpu %lu %lu\n",
+ task->times.tms_utime,
+ task->times.tms_stime);
+
+ for (i = 0 ; i < smp_num_cpus; i++)
+ len += sprintf(buffer + len, "cpu%d %lu %lu\n",
+ i,
+ task->per_cpu_utime[cpu_logical_map(i)],
+ task->per_cpu_stime[cpu_logical_map(i)]);
+
+ return len;
+}
+#endif
diff --git a/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/filemap.c b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/filemap.c
new file mode 100644
index 000000000000..8c98b0d81bf1
--- /dev/null
+++ b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/filemap.c
@@ -0,0 +1,3143 @@
+/*
+ * linux/mm/filemap.c
+ *
+ * Copyright (C) 1994-1999 Linus Torvalds
+ */
+
+/*
+ * This file handles the generic file mmap semantics used by
+ * most "normal" filesystems (but you don't /have/ to use this:
+ * the NFS filesystem used to do this differently, for example)
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/locks.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/smp_lock.h>
+#include <linux/blkdev.h>
+#include <linux/file.h>
+#include <linux/swapctl.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/iobuf.h>
+
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mman.h>
+
+#include <linux/highmem.h>
+
+/*
+* Shared mappings implemented 30.11.1994. It's not fully working yet,
+* though.
+*
+* Shared mappings now work. 15.8.1995 Bruno.
+*
+* finished 'unifying' the page and buffer cache and SMP-threaded the
+* page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+*
+* SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
+*/
+
+unsigned long page_cache_size;
+unsigned int page_hash_bits;
+struct page **page_hash_table;
+
+spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED};
+
+/*
+* NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
+* with the pagecache_lock held.
+*
+* Ordering:
+* swap_lock ->
+* pagemap_lru_lock ->
+* pagecache_lock
+*/
+spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
+
+#define CLUSTER_PAGES (1 << page_cluster)
+#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
+
+static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
+static void add_page_to_hash_queue(struct page * page, struct page **p)
+{
+struct page *next = *p;
+
+*p = page;
+page->next_hash = next;
+page->pprev_hash = p;
+if (next)
+ next->pprev_hash = &page->next_hash;
+if (page->buffers)
+ PAGE_BUG(page);
+inc_nr_cache_pages(page);
+}
+
+static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
+{
+struct list_head *head = &mapping->clean_pages;
+
+mapping->nrpages++;
+list_add(&page->list, head);
+page->mapping = mapping;
+}
+
+static inline void remove_page_from_inode_queue(struct page * page)
+{
+struct address_space * mapping = page->mapping;
+
+mapping->nrpages--;
+list_del(&page->list);
+page->mapping = NULL;
+}
+
+static inline void remove_page_from_hash_queue(struct page * page)
+{
+struct page *next = page->next_hash;
+struct page **pprev = page->pprev_hash;
+
+if (next)
+ next->pprev_hash = pprev;
+*pprev = next;
+page->pprev_hash = NULL;
+dec_nr_cache_pages(page);
+}
+
+/*
+* Remove a page from the page cache and free it. Caller has to make
+* sure the page is locked and that nobody else uses it - or that usage
+* is safe.
+*/
+void __remove_inode_page(struct page *page)
+{
+if (PageDirty(page)) BUG();
+remove_page_from_inode_queue(page);
+remove_page_from_hash_queue(page);
+}
+
+void remove_inode_page(struct page *page)
+{
+if (!PageLocked(page))
+ PAGE_BUG(page);
+
+spin_lock(&pagecache_lock);
+__remove_inode_page(page);
+spin_unlock(&pagecache_lock);
+}
+
+static inline int sync_page(struct page *page)
+{
+struct address_space *mapping = page->mapping;
+
+if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
+ return mapping->a_ops->sync_page(page);
+return 0;
+}
+
+/*
+* Add a page to the dirty page list.
+*/
+void set_page_dirty(struct page *page)
+{
+if (!test_and_set_bit(PG_dirty, &page->flags)) {
+ struct address_space *mapping = page->mapping;
+
+ if (mapping) {
+ spin_lock(&pagecache_lock);
+ list_del(&page->list);
+ list_add(&page->list, &mapping->dirty_pages);
+ spin_unlock(&pagecache_lock);
+
+ if (mapping->host)
+ mark_inode_dirty_pages(mapping->host);
+ }
+}
+}
+
+/**
+* invalidate_inode_pages - Invalidate all the unlocked pages of one inode
+* @inode: the inode which pages we want to invalidate
+*
+* This function only removes the unlocked pages, if you want to
+* remove all the pages of one inode, you must call truncate_inode_pages.
+*/
+
+void invalidate_inode_pages(struct inode * inode)
+{
+struct list_head *head, *curr;
+struct page * page;
+
+head = &inode->i_mapping->clean_pages;
+
+spin_lock(&pagemap_lru_lock);
+spin_lock(&pagecache_lock);
+curr = head->next;
+
+while (curr != head) {
+ page = list_entry(curr, struct page, list);
+ curr = curr->next;
+
+ /* We cannot invalidate something in dirty.. */
+ if (PageDirty(page))
+ continue;
+
+ /* ..or locked */
+ if (TryLockPage(page))
+ continue;
+
+ if (page->buffers && !try_to_free_buffers(page, 0))
+ goto unlock;
+
+ if (page_count(page) != 1)
+ goto unlock;
+
+ __lru_cache_del(page);
+ __remove_inode_page(page);
+ UnlockPage(page);
+ page_cache_release(page);
+ continue;
+unlock:
+ UnlockPage(page);
+ continue;
+}
+
+spin_unlock(&pagecache_lock);
+spin_unlock(&pagemap_lru_lock);
+}
+
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+
+if (page->buffers)
+ block_flushpage(page, partial);
+
+}
+
+static void truncate_complete_page(struct page *page)
+{
+/* Leave it on the LRU if it gets converted into anonymous buffers */
+if (!page->buffers || block_flushpage(page, 0))
+ lru_cache_del(page);
+
+/*
+ * We remove the page from the page cache _after_ we have
+ * destroyed all buffer-cache references to it. Otherwise some
+ * other process might think this inode page is not in the
+ * page cache and creates a buffer-cache alias to it causing
+ * all sorts of fun problems ...
+ */
+ClearPageDirty(page);
+ClearPageUptodate(page);
+remove_inode_page(page);
+page_cache_release(page);
+}
+
+static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
+static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
+{
+struct list_head *curr;
+struct page * page;
+int unlocked = 0;
+
+restart:
+curr = head->prev;
+while (curr != head) {
+ unsigned long offset;
+
+ page = list_entry(curr, struct page, list);
+ offset = page->index;
+
+ /* Is one of the pages to truncate? */
+ if ((offset >= start) || (*partial && (offset + 1) == start)) {
+ int failed;
+
+ page_cache_get(page);
+ failed = TryLockPage(page);
+
+ list_del(head);
+ if (!failed)
+ /* Restart after this page */
+ list_add_tail(head, curr);
+ else
+ /* Restart on this page */
+ list_add(head, curr);
+
+ spin_unlock(&pagecache_lock);
+ conditional_schedule();
+ unlocked = 1;
+
+ if (!failed) {
+ if (*partial && (offset + 1) == start) {
+ truncate_partial_page(page, *partial);
+ *partial = 0;
+ } else
+ truncate_complete_page(page);
+
+ UnlockPage(page);
+ } else
+ wait_on_page(page);
+
+ page_cache_release(page);
+
+ if (current->need_resched) {
+ __set_current_state(TASK_RUNNING);
+ schedule();
+ }
+
+ spin_lock(&pagecache_lock);
+ goto restart;
+ }
+ curr = curr->prev;
+ }
+ return unlocked;
+}
+
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Truncate the page cache at a set offset, removing the pages
+ * that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
+ */
+void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
+{
+ unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+ int unlocked;
+
+ spin_lock(&pagecache_lock);
+ do {
+ unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
+ unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
+ unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
+ } while (unlocked);
+ /* Traversed all three lists without dropping the lock */
+ spin_unlock(&pagecache_lock);
+}
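
As a worked example of the start/partial split above, assuming 4 KB page-cache pages: truncating to 10000 bytes keeps pages 0-1 intact, zeroes page 2 from byte 1808 onwards, and frees every page from index 3 up.

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)	/* assumed 4 KB pages */

int main(void)
{
	unsigned long lstart = 10000;	/* new file size in bytes */

	/* First page index that lies entirely past the new size ... */
	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	/* ... and how many bytes of the straddling page survive. */
	unsigned long partial = lstart & (PAGE_CACHE_SIZE - 1);

	/* Prints: start=3 partial=1808 */
	printf("start=%lu partial=%lu\n", start, partial);
	return 0;
}
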
+
+static inline int invalidate_this_page2(struct page * page,
+ struct list_head * curr,
+ struct list_head * head)
+{
+ int unlocked = 1;
+
+ /*
+ * The page is locked and we hold the pagecache_lock as well
+ * so both page_count(page) and page->buffers stays constant here.
+ */
+ if (page_count(page) == 1 + !!page->buffers) {
+ /* Restart after this page */
+ list_del(head);
+ list_add_tail(head, curr);
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ truncate_complete_page(page);
+ } else {
+ if (page->buffers) {
+ /* Restart after this page */
+ list_del(head);
+ list_add_tail(head, curr);
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ block_invalidate_page(page);
+ } else
+ unlocked = 0;
+
+ ClearPageDirty(page);
+ ClearPageUptodate(page);
+ }
+
+ return unlocked;
+}
+
+static int FASTCALL(invalidate_list_pages2(struct list_head *));
+static int invalidate_list_pages2(struct list_head *head)
+{
+ struct list_head *curr;
+ struct page * page;
+ int unlocked = 0;
+
+ restart:
+ curr = head->prev;
+ while (curr != head) {
+ page = list_entry(curr, struct page, list);
+
+ if (!TryLockPage(page)) {
+ int __unlocked;
+
+ __unlocked = invalidate_this_page2(page, curr, head);
+ UnlockPage(page);
+ unlocked |= __unlocked;
+ if (!__unlocked) {
+ curr = curr->prev;
+ continue;
+ }
+ } else {
+ /* Restart on this page */
+ list_del(head);
+ list_add(head, curr);
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ unlocked = 1;
+ wait_on_page(page);
+ }
+
+ page_cache_release(page);
+ if (current->need_resched) {
+ __set_current_state(TASK_RUNNING);
+ schedule();
+ }
+
+ spin_lock(&pagecache_lock);
+ goto restart;
+ }
+ return unlocked;
+}
+
+/**
+ * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
+ * free the pages because they're mapped.
+ * @mapping: the address_space which pages we want to invalidate
+ */
+void invalidate_inode_pages2(struct address_space * mapping)
+{
+ int unlocked;
+
+ spin_lock(&pagecache_lock);
+ do {
+ unlocked = invalidate_list_pages2(&mapping->clean_pages);
+ unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
+ unlocked |= invalidate_list_pages2(&mapping->locked_pages);
+ } while (unlocked);
+ spin_unlock(&pagecache_lock);
+}
+
+static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
+{
+ goto inside;
+
+ for (;;) {
+ page = page->next_hash;
+inside:
+ if (!page)
+ goto not_found;
+ if (page->mapping != mapping)
+ continue;
+ if (page->index == offset)
+ break;
+ }
+
+not_found:
+ return page;
+}
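
The goto-into-the-loop construct above is just a hash-chain walk that starts at the bucket head it was handed rather than at the head's successor; the following is an equivalent straight-line version, with a simplified page structure (only the next_hash, mapping and index fields are assumed).

#include <stdio.h>

struct page {
	struct page *next_hash;
	void *mapping;
	unsigned long index;
};

/* Walk the hash chain starting at 'page' itself and return the entry whose
 * mapping and index match, or NULL when the chain runs out. */
static struct page *find_page_plain(void *mapping, unsigned long offset,
				    struct page *page)
{
	for (; page; page = page->next_hash) {
		if (page->mapping != mapping)
			continue;
		if (page->index == offset)
			return page;
	}
	return NULL;
}

int main(void)
{
	struct page p1 = { NULL, (void *)0x2, 7 };
	struct page p0 = { &p1, (void *)0x1, 5 };
	struct page *hit = find_page_plain((void *)0x2, 7, &p0);

	printf("found index %lu\n", hit ? hit->index : 0UL);
	return 0;
}
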
+
+/*
+ * By the time this is called, the page is locked and
+ * we don't have to worry about any races any more.
+ *
+ * Start the IO..
+ */
+static int writeout_one_page(struct page *page)
+{
+ struct buffer_head *bh, *head = page->buffers;
+
+ bh = head;
+ do {
+ if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
+ continue;
+
+ bh->b_flushtime = jiffies;
+ ll_rw_block(WRITE, 1, &bh);
+ } while ((bh = bh->b_this_page) != head);
+ return 0;
+}
+
+int waitfor_one_page(struct page *page)
+{
+ int error = 0;
+ struct buffer_head *bh, *head = page->buffers;
+
+ bh = head;
+ do {
+ wait_on_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh))
+ error = -EIO;
+ } while ((bh = bh->b_this_page) != head);
+ return error;
+}
+
+static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
+{
+ struct list_head *curr;
+ struct page *page;
+ int retval = 0;
+
+ spin_lock(&pagecache_lock);
+ curr = head->next;
+ while (curr != head) {
+ page = list_entry(curr, struct page, list);
+ curr = curr->next;
+ if (!page->buffers)
+ continue;
+ if (page->index >= end)
+ continue;
+ if (page->index < start)
+ continue;
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ conditional_schedule(); /* sys_msync() (only used by minixfs, udf) */
+ lock_page(page);
+
+ /* The buffers could have been free'd while we waited for the page lock */
+ if (page->buffers)
+ retval |= fn(page);
+
+ UnlockPage(page);
+ spin_lock(&pagecache_lock);
+ curr = page->list.next;
+ page_cache_release(page);
+ }
+ spin_unlock(&pagecache_lock);
+
+ return retval;
+}
+
+/*
+ * Two-stage data sync: first start the IO, then go back and
+ * collect the information..
+ */
+int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
+{
+ int retval;
+
+ /* writeout dirty buffers on pages from both clean and dirty lists */
+ retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
+ retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
+ retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
+
+ /* now wait for locked buffers on pages from both clean and dirty lists */
+ retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
+ retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
+ retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
+
+ return retval;
+}
+
+/*
+ * In-memory filesystems have to fail their
+ * writepage function - and this has to be
+ * worked around in the VM layer..
+ *
+ * We
+ * - mark the page dirty again (but do NOT
+ * add it back to the inode dirty list, as
+ * that would livelock in fdatasync)
+ * - activate the page so that the page stealer
+ * doesn't try to write it out over and over
+ * again.
+ */
+int fail_writepage(struct page *page)
+{
+ activate_page(page);
+ SetPageReferenced(page);
+ SetPageDirty(page);
+ UnlockPage(page);
+ return 0;
+}
+
+EXPORT_SYMBOL(fail_writepage);
+
+/**
+ * filemap_fdatasync - walk the list of dirty pages of the given address space
+ * and writepage() all of them.
+ *
+ * @mapping: address space structure to write
+ *
+ */
+void filemap_fdatasync(struct address_space * mapping)
+{
+ int (*writepage)(struct page *) = mapping->a_ops->writepage;
+
+ spin_lock(&pagecache_lock);
+
+ while (!list_empty(&mapping->dirty_pages)) {
+ struct page *page = list_entry(mapping->dirty_pages.next, struct page, list);
+
+ list_del(&page->list);
+ list_add(&page->list, &mapping->locked_pages);
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+
+ conditional_schedule(); /* sys_msync() */
+
+ if (!PageDirty(page))
+ goto clean;
+
+ lock_page(page);
+
+ if (PageDirty(page)) {
+ ClearPageDirty(page);
+ writepage(page);
+ } else
+ UnlockPage(page);
+clean:
+ page_cache_release(page);
+ spin_lock(&pagecache_lock);
+ }
+ spin_unlock(&pagecache_lock);
+}
+
+/**
+ * filemap_fdatawait - walk the list of locked pages of the given address space
+ * and wait for all of them.
+ *
+ * @mapping: address space structure to wait for
+ *
+ */
+void filemap_fdatawait(struct address_space * mapping)
+{
+ DEFINE_RESCHED_COUNT;
+restart:
+ spin_lock(&pagecache_lock);
+
+ while (!list_empty(&mapping->locked_pages)) {
+ struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
+
+ list_del(&page->list);
+ list_add(&page->list, &mapping->clean_pages);
+
+ if (TEST_RESCHED_COUNT(32)) {
+ RESET_RESCHED_COUNT();
+ if (conditional_schedule_needed()) {
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ unconditional_schedule();
+ page_cache_release(page);
+ goto restart;
+ }
+ }
+
+ if (!PageLocked(page))
+ continue;
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+
+ ___wait_on_page(page);
+
+ page_cache_release(page);
+ spin_lock(&pagecache_lock);
+ }
+ spin_unlock(&pagecache_lock);
+}
+
+/*
+ * Add a page to the inode page cache.
+ *
+ * The caller must have locked the page and
+ * set all the page flags correctly..
+ */
+void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
+{
+ if (!PageLocked(page))
+ BUG();
+
+ page->index = index;
+ page_cache_get(page);
+ spin_lock(&pagecache_lock);
+ add_page_to_inode_queue(mapping, page);
+ add_page_to_hash_queue(page, page_hash(mapping, index));
+ spin_unlock(&pagecache_lock);
+
+ lru_cache_add(page);
+}
+
+/*
+ * This adds a page to the page cache, starting out as locked,
+ * owned by us, but unreferenced, not uptodate and with no errors.
+ */
+static inline void __add_to_page_cache(struct page * page,
+ struct address_space *mapping, unsigned long offset,
+ struct page **hash)
+{
+ unsigned long flags;
+
+ flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
+ page->flags = flags | (1 << PG_locked);
+ page_cache_get(page);
+ page->index = offset;
+ add_page_to_inode_queue(mapping, page);
+ add_page_to_hash_queue(page, hash);
+}
+
+void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
+{
+ spin_lock(&pagecache_lock);
+ __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
+ spin_unlock(&pagecache_lock);
+ lru_cache_add(page);
+}
+
+int add_to_page_cache_unique(struct page * page,
+ struct address_space *mapping, unsigned long offset,
+ struct page **hash)
+{
+ int err;
+ struct page *alias;
+
+ spin_lock(&pagecache_lock);
+ alias = __find_page_nolock(mapping, offset, *hash);
+
+ err = 1;
+ if (!alias) {
+ __add_to_page_cache(page,mapping,offset,hash);
+ err = 0;
+ }
+
+ spin_unlock(&pagecache_lock);
+ if (!err)
+ lru_cache_add(page);
+ return err;
+}
+
+/*
+ * This adds the requested page to the page cache if it isn't already there,
+ * and schedules an I/O to read in its contents from disk.
+ */
+static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
+static int page_cache_read(struct file * file, unsigned long offset)
+{
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct page **hash = page_hash(mapping, offset);
+ struct page *page;
+
+ conditional_schedule();
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(mapping, offset, *hash);
+ spin_unlock(&pagecache_lock);
+ if (page)
+ return 0;
+
+ page = page_cache_alloc(mapping);
+ if (!page)
+ return -ENOMEM;
+
+ if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
+ int error = mapping->a_ops->readpage(file, page);
+ page_cache_release(page);
+ return error;
+ }
+ /*
+ * We arrive here in the unlikely event that someone
+ * raced with us and added our page to the cache first.
+ */
+ page_cache_release(page);
+ return 0;
+}
+
+/*
+ * Read in an entire cluster at once. A cluster is usually a 64k-
+ * aligned block that includes the page requested in "offset."
+ */
+static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
+ unsigned long filesize));
+static int read_cluster_nonblocking(struct file * file, unsigned long offset,
+ unsigned long filesize)
+{
+ unsigned long pages = CLUSTER_PAGES;
+
+ offset = CLUSTER_OFFSET(offset);
+ while ((pages-- > 0) && (offset < filesize)) {
+ int error = page_cache_read(file, offset);
+ if (error < 0)
+ return error;
+ offset ++;
+ }
+
+ return 0;
+}
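
For reference, CLUSTER_PAGES and CLUSTER_OFFSET (defined near the top of this file) round a page index down to the start of its cluster; a quick standalone check, assuming page_cluster = 4, i.e. 16-page (64 KB) clusters with 4 KB pages:

#include <stdio.h>

static unsigned int page_cluster = 4;	/* assumed: 16-page (64 KB) clusters */

#define CLUSTER_PAGES     (1 << page_cluster)
#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)

int main(void)
{
	unsigned long offsets[] = { 0, 5, 16, 23, 37 };

	for (unsigned int i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++)
		printf("page %2lu -> cluster start %2lu (%d pages per cluster)\n",
		       offsets[i], CLUSTER_OFFSET(offsets[i]), CLUSTER_PAGES);
	return 0;
}
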
+
+/*
+ * Wait for a page to get unlocked.
+ *
+ * This must be called with the caller "holding" the page,
+ * ie with increased "page->count" so that the page won't
+ * go away during the wait..
+ */
+void ___wait_on_page(struct page *page)
+{
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue(&page->wait, &wait);
+ do {
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (!PageLocked(page))
+ break;
+ sync_page(page);
+ schedule();
+ } while (PageLocked(page));
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
+}
+
+void unlock_page(struct page *page)
+{
+ ClearPageLaunder(page);
+ smp_mb__before_clear_bit();
+ if (!test_and_clear_bit(PG_locked, &(page)->flags))
+ BUG();
+ smp_mb__after_clear_bit();
+ if (waitqueue_active(&(page)->wait))
+ wake_up(&(page)->wait);
+}
+
+/*
+ * Get a lock on the page, assuming we need to sleep
+ * to get it..
+ */
+static void __lock_page(struct page *page)
+{
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue_exclusive(&page->wait, &wait);
+ for (;;) {
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (PageLocked(page)) {
+ sync_page(page);
+ schedule();
+ }
+ if (!TryLockPage(page))
+ break;
+ }
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
+}
+
+
+/*
+ * Get an exclusive lock on the page, optimistically
+ * assuming it's not locked..
+ */
+void lock_page(struct page *page)
+{
+ if (TryLockPage(page))
+ __lock_page(page);
+}
+
+/*
+ * a rather lightweight function, finding and getting a reference to a
+ * hashed page atomically.
+ */
+struct page * __find_get_page(struct address_space *mapping,
+ unsigned long offset, struct page **hash)
+{
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(mapping, offset, *hash);
+ if (page)
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ return page;
+}
+
+/*
+ * Same as above, but trylock it instead of incrementing the count.
+ */
+struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
+{
+ struct page *page;
+ struct page **hash = page_hash(mapping, offset);
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(mapping, offset, *hash);
+ if (page) {
+ if (TryLockPage(page))
+ page = NULL;
+ }
+ spin_unlock(&pagecache_lock);
+ return page;
+}
+
+/*
+ * Must be called with the pagecache lock held,
+ * will return with it held (but it may be dropped
+ * during blocking operations).
+ */
+static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
+static struct page * __find_lock_page_helper(struct address_space *mapping,
+ unsigned long offset, struct page *hash)
+{
+ struct page *page;
+
+ /*
+ * We scan the hash list read-only. Addition to and removal from
+ * the hash-list needs a held write-lock.
+ */
+repeat:
+ conditional_schedule(); /* unlink large files */
+ page = __find_page_nolock(mapping, offset, hash);
+ if (page) {
+ page_cache_get(page);
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ lock_page(page);
+ spin_lock(&pagecache_lock);
+
+ /* Has the page been re-allocated while we slept? */
+ if (page->mapping != mapping || page->index != offset) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ }
+ }
+ return page;
+}
+
+/*
+ * Same as the above, but lock the page too, verifying that
+ * it's still valid once we own it.
+ */
+struct page * __find_lock_page (struct address_space *mapping,
+ unsigned long offset, struct page **hash)
+{
+ struct page *page;
+
+ spin_lock(&pagecache_lock);
+ page = __find_lock_page_helper(mapping, offset, *hash);
+ spin_unlock(&pagecache_lock);
+ return page;
+}
+
+/*
+ * Same as above, but create the page if required..
+ */
+struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
+{
+ struct page *page;
+ struct page **hash = page_hash(mapping, index);
+
+ spin_lock(&pagecache_lock);
+ page = __find_lock_page_helper(mapping, index, *hash);
+ spin_unlock(&pagecache_lock);
+ if (!page) {
+ struct page *newpage = alloc_page(gfp_mask);
+ page = ERR_PTR(-ENOMEM);
+ if (newpage) {
+ spin_lock(&pagecache_lock);
+ page = __find_lock_page_helper(mapping, index, *hash);
+ if (likely(!page)) {
+ page = newpage;
+ __add_to_page_cache(page, mapping, index, hash);
+ newpage = NULL;
+ }
+ spin_unlock(&pagecache_lock);
+ if (newpage == NULL)
+ lru_cache_add(page);
+ else
+ page_cache_release(newpage);
+ }
+ }
+ return page;
+}
+
+/*
+ * Returns locked page at given index in given cache, creating it if needed.
+ */
+struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
+{
+ return find_or_create_page(mapping, index, mapping->gfp_mask);
+}
+
+
+/*
+ * Same as grab_cache_page, but do not wait if the page is unavailable.
+ * This is intended for speculative data generators, where the data can
+ * be regenerated if the page couldn't be grabbed. This routine should
+ * be safe to call while holding the lock for another page.
+ */
+struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
+{
+ struct page *page, **hash;
+
+ hash = page_hash(mapping, index);
+ page = __find_get_page(mapping, index, hash);
+
+ if ( page ) {
+ if ( !TryLockPage(page) ) {
+ /* Page found and locked */
+ /* This test is overly paranoid, but what the heck... */
+ if ( unlikely(page->mapping != mapping || page->index != index) ) {
+ /* Someone reallocated this page under us. */
+ UnlockPage(page);
+ page_cache_release(page);
+ return NULL;
+ } else {
+ return page;
+ }
+ } else {
+ /* Page locked by someone else */
+ page_cache_release(page);
+ return NULL;
+ }
+ }
+
+ page = page_cache_alloc(mapping);
+ if ( unlikely(!page) )
+ return NULL; /* Failed to allocate a page */
+
+ if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
+ /* Someone else grabbed the page already. */
+ page_cache_release(page);
+ return NULL;
+ }
+
+ return page;
+}
+
+#if 0
+#define PROFILE_READAHEAD
+#define DEBUG_READAHEAD
+#endif
+
+/*
+ * Read-ahead profiling information
+ * --------------------------------
+ * Every PROFILE_MAXREADCOUNT, the following information is written
+ * to the syslog:
+ * Percentage of asynchronous read-ahead.
+ * Average of read-ahead fields context value.
+ * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
+ * to the syslog.
+ */
+
+#ifdef PROFILE_READAHEAD
+
+#define PROFILE_MAXREADCOUNT 1000
+
+static unsigned long total_reada;
+static unsigned long total_async;
+static unsigned long total_ramax;
+static unsigned long total_ralen;
+static unsigned long total_rawin;
+
+static void profile_readahead(int async, struct file *filp)
+{
+ unsigned long flags;
+
+ ++total_reada;
+ if (async)
+ ++total_async;
+
+ total_ramax += filp->f_ramax;
+ total_ralen += filp->f_ralen;
+ total_rawin += filp->f_rawin;
+
+ if (total_reada > PROFILE_MAXREADCOUNT) {
+ save_flags(flags);
+ cli();
+ if (!(total_reada > PROFILE_MAXREADCOUNT)) {
+ restore_flags(flags);
+ return;
+ }
+
+ printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
+ total_ramax/total_reada,
+ total_ralen/total_reada,
+ total_rawin/total_reada,
+ (total_async*100)/total_reada);
+#ifdef DEBUG_READAHEAD
+ printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
+ filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
+#endif
+
+ total_reada = 0;
+ total_async = 0;
+ total_ramax = 0;
+ total_ralen = 0;
+ total_rawin = 0;
+
+ restore_flags(flags);
+ }
+}
+#endif /* defined PROFILE_READAHEAD */
+
+/*
+ * Read-ahead context:
+ * -------------------
+ * The read ahead context fields of the "struct file" are the following:
+ * - f_raend : position of the first byte after the last page we tried to
+ * read ahead.
+ * - f_ramax : current read-ahead maximum size.
+ * - f_ralen : length of the current IO read block we tried to read-ahead.
+ * - f_rawin : length of the current read-ahead window.
+ * if last read-ahead was synchronous then
+ * f_rawin = f_ralen
+ * otherwise (was asynchronous)
+ * f_rawin = previous value of f_ralen + f_ralen
+ *
+ * Read-ahead limits:
+ * ------------------
+ * MIN_READAHEAD : minimum read-ahead size when read-ahead.
+ * MAX_READAHEAD : maximum read-ahead size when read-ahead.
+ *
+ * Synchronous read-ahead benefits:
+ * --------------------------------
+ * Using a reasonable IO transfer length from peripheral devices increases system
+ * performance.
+ * Reasonable means, in this context, not too large but not too small.
+ * The actual maximum value is:
+ * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
+ * and 32K if defined (4K page size assumed).
+ *
+ * Asynchronous read-ahead benefits:
+ * ---------------------------------
+ * Overlapping next read request and user process execution increase system
+ * performance.
+ *
+ * Read-ahead risks:
+ * -----------------
+ * We have to guess which further data are needed by the user process.
+ * If these data are often not really needed, it's bad for system
+ * performance.
+ * However, we know that files are often accessed sequentially by
+ * application programs, and it seems possible to have a reasonably good
+ * strategy for that guessing.
+ * We only try to read ahead files that seem to be read sequentially.
+ *
+ * Asynchronous read-ahead risks:
+ * ------------------------------
+ * In order to maximize overlapping, we must start some asynchronous read
+ * request from the device, as soon as possible.
+ * We must be very careful about:
+ * - The number of effective pending IO read requests.
+ * ONE seems to be the only reasonable value.
+ * - The total memory pool usage for the file access stream.
+ * This maximum memory usage is implicitly 2 IO read chunks:
+ * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
+ * 64k if defined (4K page size assumed).
+ */
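
A compact restatement of the f_ralen/f_rawin bookkeeping described above, with sizes in pages and the field behaviour taken as documented (a synchronous read-ahead starts a fresh window, an asynchronous one extends it by the previous block):

#include <stdio.h>

struct ra_state {
	unsigned long raend;	/* first page after the last read-ahead block */
	unsigned long ralen;	/* length of the current read-ahead block      */
	unsigned long rawin;	/* length of the whole read-ahead window       */
};

/* Record a read-ahead block of 'len' pages ending at page 'raend'. */
static void ra_update(struct ra_state *ra, unsigned long raend,
		      unsigned long len, int async)
{
	ra->rawin = async ? ra->ralen + len : len;
	ra->ralen = len;
	ra->raend = raend;
}

int main(void)
{
	struct ra_state ra = { 0, 0, 0 };

	ra_update(&ra, 8, 8, 0);	/* synchronous: window is the block itself */
	printf("sync : raend=%lu ralen=%lu rawin=%lu\n", ra.raend, ra.ralen, ra.rawin);

	ra_update(&ra, 16, 8, 1);	/* asynchronous: window spans both blocks  */
	printf("async: raend=%lu ralen=%lu rawin=%lu\n", ra.raend, ra.ralen, ra.rawin);
	return 0;
}
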
+
+static inline int get_max_readahead(struct inode * inode)
+{
+ if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
+ return MAX_READAHEAD;
+ return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
+}
+
+static void generic_file_readahead(int reada_ok,
+ struct file * filp, struct inode * inode,
+ struct page * page)
+{
+ unsigned long end_index;
+ unsigned long index = page->index;
+ unsigned long max_ahead, ahead;
+ unsigned long raend;
+ int max_readahead = get_max_readahead(inode);
+
+ end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+
+ raend = filp->f_raend;
+ max_ahead = 0;
+
+/*
+ * The current page is locked.
+ * If the current position is inside the previous read IO request, do not
+ * try to reread previously read ahead pages.
+ * Otherwise, decide whether or not to read ahead some pages synchronously.
+ * If we are not going to read ahead, set the read ahead context for this
+ * page only.
+ */
+ if (PageLocked(page)) {
+ if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
+ raend = index;
+ if (raend < end_index)
+ max_ahead = filp->f_ramax;
+ filp->f_rawin = 0;
+ filp->f_ralen = 1;
+ if (!max_ahead) {
+ filp->f_raend = index + filp->f_ralen;
+ filp->f_rawin += filp->f_ralen;
+ }
+ }
+ }
+/*
+ * The current page is not locked.
+ * If we were reading ahead and,
+ * if the current max read ahead size is not zero and,
+ * if the current position is inside the last read-ahead IO request,
+ * it is the moment to try to read ahead asynchronously.
+ * We will later force unplug device in order to force asynchronous read IO.
+ */
+ else if (reada_ok && filp->f_ramax && raend >= 1 &&
+ index <= raend && index + filp->f_ralen >= raend) {
+/*
+ * Add ONE page to max_ahead in order to try to have about the same IO max size
+ * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
+ * Compute the position of the last page we have tried to read in order to
+ * begin to read ahead just at the next page.
+ */
+ raend -= 1;
+ if (raend < end_index)
+ max_ahead = filp->f_ramax + 1;
+
+ if (max_ahead) {
+ filp->f_rawin = filp->f_ralen;
+ filp->f_ralen = 0;
+ reada_ok = 2;
+ }
+ }
+/*
+ * Try to read ahead pages.
+ * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
+ * scheduler will work well enough for us to avoid really bad actual IO requests.
+ */
+ ahead = 0;
+ while (ahead < max_ahead) {
+ ahead ++;
+ if ((raend + ahead) >= end_index)
+ break;
+ if (page_cache_read(filp, raend + ahead) < 0)
+ break;
+ }
+/*
+ * If we tried to read ahead some pages,
+ * If we tried to read ahead asynchronously,
+ * Try to force unplug of the device in order to start an asynchronous
+ * read IO request.
+ * Update the read-ahead context.
+ * Store the length of the current read-ahead window.
+ * Double the current max read ahead size.
+ * That heuristic avoids doing large IO for files that are not really
+ * accessed sequentially.
+ */
+ if (ahead) {
+ filp->f_ralen += ahead;
+ filp->f_rawin += filp->f_ralen;
+ filp->f_raend = raend + ahead + 1;
+
+ filp->f_ramax += filp->f_ramax;
+
+ if (filp->f_ramax > max_readahead)
+ filp->f_ramax = max_readahead;
+
+#ifdef PROFILE_READAHEAD
+ profile_readahead((reada_ok == 2), filp);
+#endif
+ }
+
+ return;
+}
+
+/*
+ * Mark a page as having seen activity.
+ *
+ * If it was already so marked, move it
+ * to the active queue and drop the referenced
+ * bit. Otherwise, just mark it for future
+ * action..
+ */
+void mark_page_accessed(struct page *page)
+{
+ if (!PageActive(page) && PageReferenced(page)) {
+ activate_page(page);
+ ClearPageReferenced(page);
+ return;
+ }
+
+ /* Mark the page referenced, AFTER checking for previous usage.. */
+ SetPageReferenced(page);
+}
+
+/*
+ * This is a generic file read routine, and uses the
+ * inode->i_op->readpage() function for the actual low-level
+ * stuff.
+ *
+ * This is really ugly. But the goto's actually try to clarify some
+ * of the logic when it comes to error handling etc.
+ */
+void __do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor, int nonblock)
+{
+ struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+ struct inode *inode = mapping->host;
+ unsigned long index, offset;
+ struct page *cached_page;
+ int reada_ok;
+ int error;
+ int max_readahead = get_max_readahead(inode);
+
+ cached_page = NULL;
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ offset = *ppos & ~PAGE_CACHE_MASK;
+
+/*
+ * If the current position is outside the previous read-ahead window,
+ * we reset the current read-ahead context and set read ahead max to zero
+ * (will be set to just needed value later),
+ * otherwise, we assume that the file accesses are sequential enough to
+ * continue read-ahead.
+ */
+ if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
+ reada_ok = 0;
+ filp->f_raend = 0;
+ filp->f_ralen = 0;
+ filp->f_ramax = 0;
+ filp->f_rawin = 0;
+ } else {
+ reada_ok = 1;
+ }
+/*
+ * Adjust the current value of read-ahead max.
+ * If the read operation stays within the first half page, force no readahead.
+ * Otherwise try to increase read ahead max just enough to do the read request.
+ * Then, at least MIN_READAHEAD if read ahead is ok,
+ * and at most MAX_READAHEAD in all cases.
+ */
+ if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
+ filp->f_ramax = 0;
+ } else {
+ unsigned long needed;
+
+ needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
+
+ if (filp->f_ramax < needed)
+ filp->f_ramax = needed;
+
+ if (reada_ok && filp->f_ramax < MIN_READAHEAD)
+ filp->f_ramax = MIN_READAHEAD;
+ if (filp->f_ramax > max_readahead)
+ filp->f_ramax = max_readahead;
+ }
+
+ for (;;) {
+ struct page *page, **hash;
+ unsigned long end_index, nr, ret;
+
+ end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+
+ if (index > end_index)
+ break;
+ nr = PAGE_CACHE_SIZE;
+ if (index == end_index) {
+ nr = inode->i_size & ~PAGE_CACHE_MASK;
+ if (nr <= offset)
+ break;
+ }
+
+ nr = nr - offset;
+
+ /*
+ * Try to find the data in the page cache..
+ */
+ hash = page_hash(mapping, index);
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(mapping, index, *hash);
+ if (!page)
+ goto no_cached_page;
+found_page:
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+
+ conditional_schedule();
+
+ if (!Page_Uptodate(page)) {
+ if (nonblock) {
+ page_cache_release(page);
+ desc->error = -EWOULDBLOCKIO;
+ break;
+ }
+ goto page_not_up_to_date;
+ }
+ if (!nonblock)
+ generic_file_readahead(reada_ok, filp, inode, page);
+page_ok:
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping->i_mmap_shared != NULL)
+ flush_dcache_page(page);
+
+ /*
+ * Mark the page accessed if we read the
+ * beginning or we just did an lseek.
+ */
+ if (!offset || !filp->f_reada)
+ mark_page_accessed(page);
+
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ *
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ ret = actor(desc, page, offset, nr);
+ offset += ret;
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+
+ page_cache_release(page);
+
+ conditional_schedule();
+
+ if (ret == nr && desc->count)
+ continue;
+ break;
+
+/*
+ * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
+ */
+page_not_up_to_date:
+ generic_file_readahead(reada_ok, filp, inode, page);
+
+ if (Page_Uptodate(page))
+ goto page_ok;
+
+ /* Get exclusive access to the page ... */
+ lock_page(page);
+
+ /* Did it get unhashed before we got the lock? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ continue;
+ }
+
+ /* Did somebody else fill it already? */
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto page_ok;
+ }
+
+readpage:
+ /* ... and start the actual read. The read will unlock the page. */
+ error = mapping->a_ops->readpage(filp, page);
+
+ if (!error) {
+ if (Page_Uptodate(page))
+ goto page_ok;
+
+ /* Again, try some read-ahead while waiting for the page to finish.. */
+ generic_file_readahead(reada_ok, filp, inode, page);
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto page_ok;
+ error = -EIO;
+ }
+
+ /* UHHUH! A synchronous read error occurred. Report it */
+ desc->error = error;
+ page_cache_release(page);
+ break;
+
+no_cached_page:
+ if (nonblock) {
+ spin_unlock(&pagecache_lock);
+ desc->error = -EWOULDBLOCKIO;
+ break;
+ }
+ /*
+ * Ok, it wasn't cached, so we need to create a new
+ * page..
+ *
+ * We get here with the page cache lock held.
+ */
+ if (!cached_page) {
+ spin_unlock(&pagecache_lock);
+ cached_page = page_cache_alloc(mapping);
+ if (!cached_page) {
+ desc->error = -ENOMEM;
+ break;
+ }
+
+ /*
+ * Somebody may have added the page while we
+ * dropped the page cache lock. Check for that.
+ */
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(mapping, index, *hash);
+ if (page)
+ goto found_page;
+ }
+
+ /*
+ * Ok, add the new page to the hash-queues...
+ */
+ page = cached_page;
+ __add_to_page_cache(page, mapping, index, hash);
+ spin_unlock(&pagecache_lock);
+ lru_cache_add(page);
+ cached_page = NULL;
+
+ goto readpage;
+ }
+
+ *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+ filp->f_reada = 1;
+ if (cached_page)
+ page_cache_release(cached_page);
+ UPDATE_ATIME(inode);
+}
+
+static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
+{
+ ssize_t retval;
+ int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
+ struct kiobuf * iobuf;
+ struct inode * inode = filp->f_dentry->d_inode;
+ struct address_space * mapping = inode->i_mapping;
+
+ new_iobuf = 0;
+ iobuf = filp->f_iobuf;
+ if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+ /*
+ * A parallel read/write is using the preallocated iobuf
+ * so just run slow and allocate a new one.
+ */
+ retval = alloc_kiovec(1, &iobuf);
+ if (retval)
+ goto out;
+ new_iobuf = 1;
+ }
+
+ blocksize = 1 << inode->i_blkbits;
+ blocksize_bits = inode->i_blkbits;
+ blocksize_mask = blocksize - 1;
+ chunk_size = KIO_MAX_ATOMIC_IO << 10;
+
+ retval = -EINVAL;
+ if ((offset & blocksize_mask) || (count & blocksize_mask))
+ goto out_free;
+ if (!mapping->a_ops->direct_IO)
+ goto out_free;
+
+ /*
+	 * Flush only the _data_ to disk; the metadata must remain
+	 * completely asynchronous or performance will go to /dev/null.
+ */
+ filemap_fdatasync(mapping);
+ retval = fsync_inode_data_buffers(inode);
+ filemap_fdatawait(mapping);
+ if (retval < 0)
+ goto out_free;
+
+ progress = retval = 0;
+ while (count > 0) {
+ iosize = count;
+ if (iosize > chunk_size)
+ iosize = chunk_size;
+
+ retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
+ if (retval)
+ break;
+
+ retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
+
+ if (rw == READ && retval > 0)
+ mark_dirty_kiobuf(iobuf, retval);
+
+ if (retval >= 0) {
+ count -= retval;
+ buf += retval;
+ progress += retval;
+ }
+
+ unmap_kiobuf(iobuf);
+
+ if (retval != iosize)
+ break;
+ }
+
+ if (progress)
+ retval = progress;
+
+ out_free:
+ if (!new_iobuf)
+ clear_bit(0, &filp->f_iobuf_lock);
+ else
+ free_kiovec(1, &iobuf);
+ out:
+ return retval;
+}
+
+int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
+{
+ char *kaddr;
+ unsigned long left, count = desc->count;
+
+ if (size > count)
+ size = count;
+
+ kaddr = kmap(page);
+ left = __copy_to_user(desc->buf, kaddr + offset, size);
+ kunmap(page);
+
+ if (left) {
+ size -= left;
+ desc->error = -EFAULT;
+ }
+ desc->count = count - size;
+ desc->written += size;
+ desc->buf += size;
+ return size;
+}
+
+/*
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+{
+ ssize_t retval;
+
+ if ((ssize_t) count < 0)
+ return -EINVAL;
+
+ if (filp->f_flags & O_DIRECT)
+ goto o_direct;
+
+ retval = -EFAULT;
+ if (access_ok(VERIFY_WRITE, buf, count)) {
+ retval = 0;
+
+ if (count) {
+ read_descriptor_t desc;
+
+ desc.written = 0;
+ desc.count = count;
+ desc.buf = buf;
+ desc.error = 0;
+ do_generic_file_read(filp, ppos, &desc, file_read_actor);
+
+ retval = desc.written;
+ if (!retval)
+ retval = desc.error;
+ }
+ }
+ out:
+ return retval;
+
+ o_direct:
+ {
+ loff_t pos = *ppos, size;
+ struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+ struct inode *inode = mapping->host;
+
+ retval = 0;
+ if (!count)
+ goto out; /* skip atime */
+ size = inode->i_size;
+ if (pos < size) {
+ if (pos + count > size)
+ count = size - pos;
+ retval = generic_file_direct_IO(READ, filp, buf, count, pos);
+ if (retval > 0)
+ *ppos = pos + retval;
+ }
+ UPDATE_ATIME(filp->f_dentry->d_inode);
+ goto out;
+ }
+}
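+
+/*
+ * Illustrative user-space sketch (not part of this file): the O_DIRECT
+ * path above fails with -EINVAL unless the file offset and the transfer
+ * size are multiples of the inode's block size.  The 512-byte block size
+ * and buffer size below are assumptions for the example only.
+ *
+ *	#define _GNU_SOURCE
+ *	#include <fcntl.h>
+ *	#include <malloc.h>
+ *	#include <unistd.h>
+ *
+ *	int fd = open("datafile", O_RDONLY | O_DIRECT);
+ *	char *buf = memalign(512, 65536);
+ *	ssize_t n = read(fd, buf, 65536);	// offset 0 and count are both
+ *						// multiples of the block size
+ */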
+
+static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
+{
+ ssize_t written;
+ unsigned long count = desc->count;
+ struct file *file = (struct file *) desc->buf;
+
+ if (size > count)
+ size = count;
+
+ if (file->f_op->sendpage) {
+ written = file->f_op->sendpage(file, page, offset,
+ size, &file->f_pos, size<count);
+ } else {
+ char *kaddr;
+ mm_segment_t old_fs;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ kaddr = kmap(page);
+ written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
+ kunmap(page);
+
+ set_fs(old_fs);
+ }
+ if (written < 0) {
+ desc->error = written;
+ written = 0;
+ }
+ desc->count = count - written;
+ desc->written += written;
+ return written;
+}
+
+asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+ ssize_t retval;
+ struct file * in_file, * out_file;
+ struct inode * in_inode, * out_inode;
+
+ /*
+ * Get input file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ in_file = fget(in_fd);
+ if (!in_file)
+ goto out;
+ if (!(in_file->f_mode & FMODE_READ))
+ goto fput_in;
+ retval = -EINVAL;
+ in_inode = in_file->f_dentry->d_inode;
+ if (!in_inode)
+ goto fput_in;
+ if (!in_inode->i_mapping->a_ops->readpage)
+ goto fput_in;
+ retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
+ if (retval)
+ goto fput_in;
+
+ /*
+ * Get output file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ out_file = fget(out_fd);
+ if (!out_file)
+ goto fput_in;
+ if (!(out_file->f_mode & FMODE_WRITE))
+ goto fput_out;
+ retval = -EINVAL;
+ if (!out_file->f_op || !out_file->f_op->write)
+ goto fput_out;
+ out_inode = out_file->f_dentry->d_inode;
+ retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
+ if (retval)
+ goto fput_out;
+
+ retval = 0;
+ if (count) {
+ read_descriptor_t desc;
+ loff_t pos = 0, *ppos;
+
+ retval = -EFAULT;
+ ppos = &in_file->f_pos;
+ if (offset) {
+ if (get_user(pos, offset))
+ goto fput_out;
+ ppos = &pos;
+ }
+
+ desc.written = 0;
+ desc.count = count;
+ desc.buf = (char *) out_file;
+ desc.error = 0;
+ do_generic_file_read(in_file, ppos, &desc, file_send_actor);
+
+ retval = desc.written;
+ if (!retval)
+ retval = desc.error;
+ if (offset)
+ put_user(pos, offset);
+ }
+
+fput_out:
+ fput(out_file);
+fput_in:
+ fput(in_file);
+out:
+ return retval;
+}
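+
+/*
+ * Illustrative user-space sketch (not part of this file): sendfile(2)
+ * copies "count" bytes from the page cache of in_fd to out_fd without a
+ * round trip through user memory.  "sock_fd" is assumed to be an already
+ * connected socket; the file name is an example only.
+ *
+ *	#include <sys/sendfile.h>
+ *	#include <sys/stat.h>
+ *	#include <fcntl.h>
+ *
+ *	int in = open("source", O_RDONLY);
+ *	struct stat st;
+ *	off_t off = 0;				// updated past the bytes sent
+ *	fstat(in, &st);
+ *	sendfile(sock_fd, in, &off, st.st_size);
+ */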
+
+static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
+{
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ unsigned long max;
+
+ if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
+ return -EINVAL;
+
+ /* Limit it to the size of the file.. */
+ max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
+ if (index > max)
+ return 0;
+ max -= index;
+ if (nr > max)
+ nr = max;
+
+ /* And limit it to a sane percentage of the inactive list.. */
+ max = nr_inactive_pages / 2;
+ if (nr > max)
+ nr = max;
+
+ while (nr) {
+ page_cache_read(file, index);
+ index++;
+ nr--;
+ }
+ return 0;
+}
+
+asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
+{
+ ssize_t ret;
+ struct file *file;
+
+ ret = -EBADF;
+ file = fget(fd);
+ if (file) {
+ if (file->f_mode & FMODE_READ) {
+ unsigned long start = offset >> PAGE_CACHE_SHIFT;
+ unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
+ ret = do_readahead(file, start, len);
+ }
+ fput(file);
+ }
+ return ret;
+}
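+
+/*
+ * Illustrative user-space sketch (not part of this file): readahead(2)
+ * populates the page cache for a region of a file without waiting for the
+ * data.  A glibc wrapper (where available, with _GNU_SOURCE) is assumed;
+ * the file name and sizes are examples only.
+ *
+ *	#define _GNU_SOURCE
+ *	#include <fcntl.h>
+ *
+ *	int fd = open("bigfile", O_RDONLY);
+ *	readahead(fd, 0, 1 << 20);	// prefetch the first 1MB
+ *	// later reads of that range should mostly hit the page cache
+ */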
+
+/*
+ * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
+ * sure this is sequential access, we don't need a flexible read-ahead
+ * window size -- we can always use a large fixed size window.
+ */
+static void nopage_sequential_readahead(struct vm_area_struct * vma,
+ unsigned long pgoff, unsigned long filesize)
+{
+ unsigned long ra_window;
+
+ ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
+ ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
+
+ /* vm_raend is zero if we haven't read ahead in this area yet. */
+ if (vma->vm_raend == 0)
+ vma->vm_raend = vma->vm_pgoff + ra_window;
+
+ /*
+ * If we've just faulted the page half-way through our window,
+ * then schedule reads for the next window, and release the
+ * pages in the previous window.
+ */
+ if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
+ unsigned long vm_raend = *(volatile unsigned long *) &vma->vm_raend;
+ unsigned long start = vma->vm_pgoff + vm_raend;
+ unsigned long end = start + ra_window;
+
+ if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
+ end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
+ /*
+ * Sanitize 'start' as well because vm_raend is racy when only
+ * the read sem is acquired like here.
+ */
+ if (start < vma->vm_pgoff)
+ return;
+ if (start > end)
+ return;
+
+ while ((start < end) && (start < filesize)) {
+ if (read_cluster_nonblocking(vma->vm_file,
+ start, filesize) < 0)
+ break;
+ start += CLUSTER_PAGES;
+ }
+ run_task_queue(&tq_disk);
+
+ /* if we're far enough past the beginning of this area,
+ recycle pages that are in the previous window. */
+ if (vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
+ unsigned long window = ra_window << PAGE_SHIFT;
+
+ end = vma->vm_start + (vm_raend << PAGE_SHIFT);
+ end -= window + window;
+ filemap_sync(vma, end - window, window, MS_INVALIDATE);
+ }
+
+ vma->vm_raend += ra_window;
+ }
+
+ return;
+}
+
+/*
+ * filemap_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ */
+struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
+{
+ int error;
+ struct file *file = area->vm_file;
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct inode *inode = mapping->host;
+ struct page *page, **hash;
+ unsigned long size, pgoff, endoff;
+
+ pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+
+retry_all:
+ /*
+ * An external ptracer can access pages that normally aren't
+ * accessible..
+ */
+ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if ((pgoff >= size) && (area->vm_mm == current->mm))
+ return NULL;
+
+ /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
+ if (size > endoff)
+ size = endoff;
+
+ /*
+ * Do we have something in the page cache already?
+ */
+ hash = page_hash(mapping, pgoff);
+retry_find:
+ page = __find_get_page(mapping, pgoff, hash);
+ if (!page)
+ goto no_cached_page;
+
+ /*
+ * Ok, found a page in the page cache, now we need to check
+ * that it's up-to-date.
+ */
+ if (!Page_Uptodate(page))
+ goto page_not_uptodate;
+
+success:
+ /*
+ * Try read-ahead for sequential areas.
+ */
+ if (VM_SequentialReadHint(area))
+ nopage_sequential_readahead(area, pgoff, size);
+
+ /*
+ * Found the page and have a reference on it, need to check sharing
+ * and possibly copy it over to another page..
+ */
+ activate_page(page);
+ return page;
+
+no_cached_page:
+ /*
+ * If the requested offset is within our file, try to read a whole
+ * cluster of pages at once.
+ *
+ * Otherwise, we're off the end of a privately mapped file,
+ * so we need to map a zero page.
+ */
+ if ((pgoff < size) && !VM_RandomReadHint(area))
+ error = read_cluster_nonblocking(file, pgoff, size);
+ else
+ error = page_cache_read(file, pgoff);
+
+ /*
+ * The page we want has now been added to the page cache.
+ * In the unlikely event that someone removed it in the
+ * meantime, we'll just come back here and read it again.
+ */
+ if (error >= 0)
+ goto retry_find;
+
+ /*
+ * An error return from page_cache_read can result if the
+ * system is low on memory, or a problem occurs while trying
+ * to schedule I/O.
+ */
+ if (error == -ENOMEM)
+ return NOPAGE_OOM;
+ return NULL;
+
+page_not_uptodate:
+ lock_page(page);
+
+ /* Did it get unhashed while we waited for it? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry_all;
+ }
+
+ /* Did somebody else get it up-to-date? */
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto success;
+ }
+
+ if (!mapping->a_ops->readpage(file, page)) {
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto success;
+ }
+
+ /*
+ * Umm, take care of errors if the page isn't up-to-date.
+ * Try to re-read it _once_. We do this synchronously,
+ * because there really aren't any performance issues here
+ * and we need to check for errors.
+ */
+ lock_page(page);
+
+ /* Somebody truncated the page on us? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry_all;
+ }
+
+ /* Somebody else successfully read it in? */
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto success;
+ }
+ ClearPageError(page);
+ if (!mapping->a_ops->readpage(file, page)) {
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto success;
+ }
+
+ /*
+ * Things didn't work out. Return zero to tell the
+ * mm layer so, possibly freeing the page cache page first.
+ */
+ page_cache_release(page);
+ return NULL;
+}
+
+/* Called with mm->page_table_lock held to protect against other
+ * threads/the swapper from ripping pte's out from under us.
+ */
+static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ pte_t pte = *ptep;
+
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+ if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
+ flush_tlb_page(vma, address);
+ set_page_dirty(page);
+ }
+ }
+ return 0;
+}
+
+static inline int filemap_sync_pte_range(pmd_t * pmd,
+ unsigned long address, unsigned long size,
+ struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
+{
+ pte_t * pte;
+ unsigned long end;
+ int error;
+
+ if (pmd_none(*pmd))
+ return 0;
+ if (pmd_bad(*pmd)) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ return 0;
+ }
+ pte = pte_offset(pmd, address);
+ offset += address & PMD_MASK;
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+ error = 0;
+ do {
+ error |= filemap_sync_pte(pte, vma, address + offset, flags);
+ address += PAGE_SIZE;
+ pte++;
+ } while (address && (address < end));
+
+ if (conditional_schedule_needed()) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ unconditional_schedule(); /* syncing large mapped files */
+ spin_lock(&vma->vm_mm->page_table_lock);
+ }
+ return error;
+}
+
+static inline int filemap_sync_pmd_range(pgd_t * pgd,
+ unsigned long address, unsigned long size,
+ struct vm_area_struct *vma, unsigned int flags)
+{
+ pmd_t * pmd;
+ unsigned long offset, end;
+ int error;
+
+ if (pgd_none(*pgd))
+ return 0;
+ if (pgd_bad(*pgd)) {
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+ return 0;
+ }
+ pmd = pmd_offset(pgd, address);
+ offset = address & PGDIR_MASK;
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ error = 0;
+ do {
+ error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address && (address < end));
+ return error;
+}
+
+int filemap_sync(struct vm_area_struct * vma, unsigned long address,
+ size_t size, unsigned int flags)
+{
+ pgd_t * dir;
+ unsigned long end = address + size;
+ int error = 0;
+
+	/* Acquire the lock early; it may be possible to avoid dropping
+	 * and reacquiring it repeatedly.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+
+ dir = pgd_offset(vma->vm_mm, address);
+ flush_cache_range(vma->vm_mm, end - size, end);
+ if (address >= end)
+ BUG();
+ do {
+ error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ dir++;
+ } while (address && (address < end));
+ flush_tlb_range(vma->vm_mm, end - size, end);
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ return error;
+}
+
+static struct vm_operations_struct generic_file_vm_ops = {
+ nopage: filemap_nopage,
+};
+
+/* This is used for a general mmap of a disk file */
+
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct inode *inode = mapping->host;
+
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
+ if (!mapping->a_ops->writepage)
+ return -EINVAL;
+ }
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ UPDATE_ATIME(inode);
+ vma->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
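+
+/*
+ * Illustrative user-space sketch (not part of this file): a filesystem
+ * that plugs generic_file_mmap() into its file_operations gets its page
+ * faults served by filemap_nopage() above.  "length" is assumed to be the
+ * page-rounded file size; the file name is an example only.
+ *
+ *	#include <sys/mman.h>
+ *	#include <fcntl.h>
+ *
+ *	int fd = open("datafile", O_RDWR);
+ *	char *p = mmap(NULL, length, PROT_READ | PROT_WRITE,
+ *		       MAP_SHARED, fd, 0);
+ *	p[0] = 'x';		// first touch faults the page in via ->nopage
+ */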
+
+/*
+ * The msync() system call.
+ */
+
+static int msync_interval(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, int flags)
+{
+ struct file * file = vma->vm_file;
+ if (file && (vma->vm_flags & VM_SHARED)) {
+ int error;
+ error = filemap_sync(vma, start, end-start, flags);
+
+ if (!error && (flags & MS_SYNC)) {
+ struct inode * inode = file->f_dentry->d_inode;
+ down(&inode->i_sem);
+ filemap_fdatasync(inode->i_mapping);
+ if (file->f_op && file->f_op->fsync)
+ error = file->f_op->fsync(file, file->f_dentry, 1);
+ filemap_fdatawait(inode->i_mapping);
+ up(&inode->i_sem);
+ }
+ return error;
+ }
+ return 0;
+}
+
+asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
+{
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error, error = -EINVAL;
+
+ down_read(&current->mm->mmap_sem);
+ if (start & ~PAGE_MASK)
+ goto out;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+ if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+ goto out;
+ error = 0;
+ if (end == start)
+ goto out;
+ /*
+ * If the interval [start,end) covers some unmapped address ranges,
+ * just ignore them, but return -EFAULT at the end.
+ */
+ vma = find_vma(current->mm, start);
+ unmapped_error = 0;
+ for (;;) {
+ /* Still start < end. */
+ error = -EFAULT;
+ if (!vma)
+ goto out;
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -EFAULT;
+ start = vma->vm_start;
+ }
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = msync_interval(vma, start, end, flags);
+ if (error)
+ goto out;
+ }
+ error = unmapped_error;
+ goto out;
+ }
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = msync_interval(vma, start, vma->vm_end, flags);
+ if (error)
+ goto out;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+out:
+ up_read(&current->mm->mmap_sem);
+ return error;
+}
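+
+/*
+ * Illustrative user-space sketch (not part of this file): msync() on a
+ * shared file mapping walks the VMAs as above; MS_SYNC also waits for the
+ * dirty pages to reach disk.  "p" and "length" refer to the mapping from
+ * the generic_file_mmap() example earlier.
+ *
+ *	#include <sys/mman.h>
+ *
+ *	p[42] = 'y';				// dirty a page
+ *	if (msync(p, length, MS_SYNC) < 0)	// flush and wait
+ *		perror("msync");
+ */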
+
+static inline void setup_read_behavior(struct vm_area_struct * vma,
+ int behavior)
+{
+ VM_ClearReadHint(vma);
+ switch(behavior) {
+ case MADV_SEQUENTIAL:
+ vma->vm_flags |= VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ vma->vm_flags |= VM_RAND_READ;
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+static long madvise_fixup_start(struct vm_area_struct * vma,
+ unsigned long end, int behavior)
+{
+ struct vm_area_struct * n;
+ struct mm_struct * mm = vma->vm_mm;
+
+ n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!n)
+ return -EAGAIN;
+ *n = *vma;
+ n->vm_end = end;
+ setup_read_behavior(n, behavior);
+ n->vm_raend = 0;
+ if (n->vm_file)
+ get_file(n->vm_file);
+ if (n->vm_ops && n->vm_ops->open)
+ n->vm_ops->open(n);
+ vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+ lock_vma_mappings(vma);
+ spin_lock(&mm->page_table_lock);
+ vma->vm_start = end;
+ __insert_vm_struct(mm, n);
+ spin_unlock(&mm->page_table_lock);
+ unlock_vma_mappings(vma);
+ return 0;
+}
+
+static long madvise_fixup_end(struct vm_area_struct * vma,
+ unsigned long start, int behavior)
+{
+ struct vm_area_struct * n;
+ struct mm_struct * mm = vma->vm_mm;
+
+ n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!n)
+ return -EAGAIN;
+ *n = *vma;
+ n->vm_start = start;
+ n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
+ setup_read_behavior(n, behavior);
+ n->vm_raend = 0;
+ if (n->vm_file)
+ get_file(n->vm_file);
+ if (n->vm_ops && n->vm_ops->open)
+ n->vm_ops->open(n);
+ lock_vma_mappings(vma);
+ spin_lock(&mm->page_table_lock);
+ vma->vm_end = start;
+ __insert_vm_struct(mm, n);
+ spin_unlock(&mm->page_table_lock);
+ unlock_vma_mappings(vma);
+ return 0;
+}
+
+static long madvise_fixup_middle(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, int behavior)
+{
+ struct vm_area_struct * left, * right;
+ struct mm_struct * mm = vma->vm_mm;
+
+ left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!left)
+ return -EAGAIN;
+ right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!right) {
+ kmem_cache_free(vm_area_cachep, left);
+ return -EAGAIN;
+ }
+ *left = *vma;
+ *right = *vma;
+ left->vm_end = start;
+ right->vm_start = end;
+ right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
+ left->vm_raend = 0;
+ right->vm_raend = 0;
+ if (vma->vm_file)
+ atomic_add(2, &vma->vm_file->f_count);
+
+ if (vma->vm_ops && vma->vm_ops->open) {
+ vma->vm_ops->open(left);
+ vma->vm_ops->open(right);
+ }
+ vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+ vma->vm_raend = 0;
+ lock_vma_mappings(vma);
+ spin_lock(&mm->page_table_lock);
+ vma->vm_start = start;
+ vma->vm_end = end;
+ setup_read_behavior(vma, behavior);
+ __insert_vm_struct(mm, left);
+ __insert_vm_struct(mm, right);
+ spin_unlock(&mm->page_table_lock);
+ unlock_vma_mappings(vma);
+ return 0;
+}
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, int behavior)
+{
+ int error = 0;
+
+ /* This caps the number of vma's this process can own */
+ if (vma->vm_mm->map_count > MAX_MAP_COUNT)
+ return -ENOMEM;
+
+ if (start == vma->vm_start) {
+ if (end == vma->vm_end) {
+ setup_read_behavior(vma, behavior);
+ vma->vm_raend = 0;
+ } else
+ error = madvise_fixup_start(vma, end, behavior);
+ } else {
+ if (end == vma->vm_end)
+ error = madvise_fixup_end(vma, start, behavior);
+ else
+ error = madvise_fixup_middle(vma, start, end, behavior);
+ }
+
+ return error;
+}
+
+/*
+ * Schedule all required I/O operations, then run the disk queue
+ * to make sure they are started. Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ long error = -EBADF;
+ struct file * file;
+ unsigned long size, rlim_rss;
+
+ /* Doesn't work if there's no mapped file. */
+ if (!vma->vm_file)
+ return error;
+ file = vma->vm_file;
+ size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+ /* Make sure this doesn't exceed the process's max rss. */
+ error = -EIO;
+ rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
+ LONG_MAX; /* default: see resource.h */
+ if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
+ return error;
+
+ /* round to cluster boundaries if this isn't a "random" area. */
+ if (!VM_RandomReadHint(vma)) {
+ start = CLUSTER_OFFSET(start);
+ end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
+
+ while ((start < end) && (start < size)) {
+ error = read_cluster_nonblocking(file, start, size);
+ start += CLUSTER_PAGES;
+ if (error < 0)
+ break;
+ }
+ } else {
+ while ((start < end) && (start < size)) {
+ error = page_cache_read(file, start);
+ start++;
+ if (error < 0)
+ break;
+ }
+ }
+
+ /* Don't wait for someone else to push these requests. */
+ run_task_queue(&tq_disk);
+
+ return error;
+}
+
+/*
+ * Application no longer needs these pages. If the pages are dirty,
+ * it's OK to just throw them away. The app will be more careful about
+ * data it wants to keep. Be sure to free swap resources too. The
+ * zap_page_range call sets things up for refill_inactive to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * refill_inactive to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do. This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them. There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ if (vma->vm_flags & VM_LOCKED)
+ return -EINVAL;
+
+ zap_page_range(vma->vm_mm, start, end - start,
+ ZPR_FLUSH_CACHE|ZPR_FLUSH_TLB|ZPR_COND_RESCHED); /* sys_madvise(MADV_DONTNEED) */
+
+ return 0;
+}
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+ unsigned long end, int behavior)
+{
+ long error = -EBADF;
+
+ switch (behavior) {
+ case MADV_NORMAL:
+ case MADV_SEQUENTIAL:
+ case MADV_RANDOM:
+ error = madvise_behavior(vma, start, end, behavior);
+ break;
+
+ case MADV_WILLNEED:
+ error = madvise_willneed(vma, start, end);
+ break;
+
+ case MADV_DONTNEED:
+ error = madvise_dontneed(vma, start, end);
+ break;
+
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area. The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques. The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ * MADV_NORMAL - the default behavior is to read clusters. This
+ * results in some read-ahead and read-behind.
+ * MADV_RANDOM - the system should read the minimum amount of data
+ * on any access, since it is unlikely that the appli-
+ * cation will need more than what it asks for.
+ * MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ * once, so they can be aggressively read ahead, and
+ * can be freed soon after they are accessed.
+ * MADV_WILLNEED - the application is notifying the system to read
+ * some pages ahead.
+ * MADV_DONTNEED - the application is finished with the given range,
+ * so the kernel can free resources associated with it.
+ *
+ * return values:
+ * zero - success
+ * -EINVAL - start + len < 0, start is not page-aligned,
+ * "behavior" is not a valid value, or application
+ * is attempting to release locked or shared pages.
+ * -ENOMEM - addresses in the specified range are not currently
+ * mapped, or are outside the AS of the process.
+ * -EIO - an I/O error occurred while paging in data.
+ * -EBADF - map exists, but area maps something that isn't a file.
+ * -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ down_write(&current->mm->mmap_sem);
+
+ if (start & ~PAGE_MASK)
+ goto out;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ error = 0;
+ if (end == start)
+ goto out;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ */
+ vma = find_vma(current->mm, start);
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = madvise_vma(vma, start, end,
+ behavior);
+ if (error)
+ goto out;
+ }
+ error = unmapped_error;
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = madvise_vma(vma, start, vma->vm_end, behavior);
+ if (error)
+ goto out;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+
+out:
+ up_write(&current->mm->mmap_sem);
+ return error;
+}
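+
+/*
+ * Illustrative user-space sketch (not part of this file): the usual
+ * pattern is to mmap() a file and then tell the kernel how it will be
+ * accessed.  "p" and "length" are assumed as in the earlier examples.
+ *
+ *	#include <sys/mman.h>
+ *
+ *	madvise(p, length, MADV_SEQUENTIAL);	// aggressive read-ahead
+ *	// ... stream through the mapping once ...
+ *	madvise(p, length, MADV_DONTNEED);	// then drop the pages
+ */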
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+ unsigned long pgoff)
+{
+ unsigned char present = 0;
+ struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
+ struct page * page, ** hash = page_hash(as, pgoff);
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(as, pgoff, *hash);
+ if ((page) && (Page_Uptodate(page)))
+ present = 1;
+ spin_unlock(&pagecache_lock);
+
+ return present;
+}
+
+static long mincore_vma(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, unsigned char * vec)
+{
+ long error, i, remaining;
+ unsigned char * tmp;
+
+ error = -ENOMEM;
+ if (!vma->vm_file)
+ return error;
+
+ start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+ error = -EAGAIN;
+ tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+ if (!tmp)
+ return error;
+
+	/* (end - start) is the # of pages, and also the # of bytes in "vec" */
+	remaining = (end - start);
+
+ error = 0;
+ for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+ int j = 0;
+ long thispiece = (remaining < PAGE_SIZE) ?
+ remaining : PAGE_SIZE;
+
+ while (j < thispiece)
+ tmp[j++] = mincore_page(vma, start++);
+
+ if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+ error = -EFAULT;
+ break;
+ }
+ }
+
+ free_page((unsigned long) tmp);
+ return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes. The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information. Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - vec points to an illegal address
+ * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
+ * or len has a nonpositive value
+ * -ENOMEM - Addresses in the range [addr, addr + len] are
+ * invalid for the address space of this process, or
+ * specify one or more pages which are not currently
+ * mapped
+ * -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+ unsigned char * vec)
+{
+ int index = 0;
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error = 0;
+ long error = -EINVAL;
+
+ down_read(&current->mm->mmap_sem);
+
+ if (start & ~PAGE_CACHE_MASK)
+ goto out;
+ len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ error = 0;
+ if (end == start)
+ goto out;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ */
+ vma = find_vma(current->mm, start);
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = mincore_vma(vma, start, end,
+ &vec[index]);
+ if (error)
+ goto out;
+ }
+ error = unmapped_error;
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+ if (error)
+ goto out;
+ index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+
+out:
+ up_read(&current->mm->mmap_sem);
+ return error;
+}
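+
+/*
+ * Illustrative user-space sketch (not part of this file): mincore() fills
+ * one byte per page of the mapping; bit 0 of each byte reports whether
+ * that page is resident.  "p" and "length" are assumed as above.
+ *
+ *	#include <sys/mman.h>
+ *	#include <stdlib.h>
+ *	#include <unistd.h>
+ *
+ *	size_t pages = (length + getpagesize() - 1) / getpagesize();
+ *	unsigned char *vec = malloc(pages);
+ *	if (mincore(p, length, vec) == 0 && (vec[0] & 1))
+ *		puts("first page is in core");
+ */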
+
+static inline
+struct page *__read_cache_page(struct address_space *mapping,
+ unsigned long index,
+ int (*filler)(void *,struct page*),
+ void *data)
+{
+ struct page **hash = page_hash(mapping, index);
+ struct page *page, *cached_page = NULL;
+ int err;
+repeat:
+ page = __find_get_page(mapping, index, hash);
+ if (!page) {
+ if (!cached_page) {
+ cached_page = page_cache_alloc(mapping);
+ if (!cached_page)
+ return ERR_PTR(-ENOMEM);
+ }
+ page = cached_page;
+ if (add_to_page_cache_unique(page, mapping, index, hash))
+ goto repeat;
+ cached_page = NULL;
+ err = filler(data, page);
+ if (err < 0) {
+ page_cache_release(page);
+ page = ERR_PTR(err);
+ }
+ }
+ if (cached_page)
+ page_cache_release(cached_page);
+ return page;
+}
+
+/*
+ * Read into the page cache. If a page already exists,
+ * and Page_Uptodate() is not set, try to fill the page.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+ unsigned long index,
+ int (*filler)(void *,struct page*),
+ void *data)
+{
+ struct page *page;
+ int err;
+
+retry:
+ page = __read_cache_page(mapping, index, filler, data);
+ if (IS_ERR(page))
+ goto out;
+ mark_page_accessed(page);
+ if (Page_Uptodate(page))
+ goto out;
+
+ lock_page(page);
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry;
+ }
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto out;
+ }
+ err = filler(data, page);
+ if (err < 0) {
+ page_cache_release(page);
+ page = ERR_PTR(err);
+ }
+ out:
+ return page;
+}
+
+static inline struct page * __grab_cache_page(struct address_space *mapping,
+ unsigned long index, struct page **cached_page)
+{
+ struct page *page, **hash = page_hash(mapping, index);
+repeat:
+ page = __find_lock_page(mapping, index, hash);
+ if (!page) {
+ if (!*cached_page) {
+ *cached_page = page_cache_alloc(mapping);
+ if (!*cached_page)
+ return NULL;
+ }
+ page = *cached_page;
+ if (add_to_page_cache_unique(page, mapping, index, hash))
+ goto repeat;
+ *cached_page = NULL;
+ }
+ return page;
+}
+
+inline void remove_suid(struct inode *inode)
+{
+ unsigned int mode;
+
+	/* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
+ mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
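+	/*
+	 * Worked example of the line above: S_IXGRP is 0010 and S_ISGID is
+	 * 02000, so (S_ISGID/S_IXGRP) is 0200 and the multiply turns a set
+	 * group-execute bit into S_ISGID (sgid without group-execute marks
+	 * mandatory locking and is deliberately left alone); S_ISUID is
+	 * always added to the mask of bits to clear.
+	 */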
+
+ /* was any of the uid bits set? */
+ mode &= inode->i_mode;
+ if (mode && !capable(CAP_FSETID)) {
+ inode->i_mode &= ~mode;
+ mark_inode_dirty(inode);
+ }
+}
+
+/*
+ * Write to a file through the page cache.
+ *
+ * We currently put everything into the page cache prior to writing it.
+ * This is not a problem when writing full pages. With partial pages,
+ * however, we first have to read the data into the cache, then
+ * dirty the page, and finally schedule it for writing. Alternatively, we
+ * could write-through just the portion of data that would go into that
+ * page, but that would kill performance for applications that write data
+ * line by line, and it's prone to race conditions.
+ *
+ * Note that this routine doesn't try to keep track of dirty pages. Each
+ * file system has to do this all by itself, unfortunately.
+ * okir@monad.swb.de
+ */
+ssize_t
+generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
+{
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct inode *inode = mapping->host;
+ unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+ loff_t pos;
+ struct page *page, *cached_page;
+ unsigned long written;
+ long status = 0;
+ int err;
+ unsigned bytes;
+
+ if ((ssize_t) count < 0)
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_READ, buf, count))
+ return -EFAULT;
+
+ cached_page = NULL;
+
+ down(&inode->i_sem);
+
+ pos = *ppos;
+ err = -EINVAL;
+ if (pos < 0)
+ goto out;
+
+ err = file->f_error;
+ if (err) {
+ file->f_error = 0;
+ goto out;
+ }
+
+ written = 0;
+
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
+ pos = inode->i_size;
+
+ /*
+ * Check whether we've reached the file size limit.
+ */
+ err = -EFBIG;
+
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ goto out;
+ }
+ if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
+ /* send_sig(SIGXFSZ, current, 0); */
+ count = limit - (u32)pos;
+ }
+ }
+
+ /*
+ * LFS rule
+ */
+ if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
+ if (pos >= MAX_NON_LFS) {
+ send_sig(SIGXFSZ, current, 0);
+ goto out;
+ }
+ if (count > MAX_NON_LFS - (u32)pos) {
+ /* send_sig(SIGXFSZ, current, 0); */
+ count = MAX_NON_LFS - (u32)pos;
+ }
+ }
+
+ /*
+ * Are we about to exceed the fs block limit ?
+ *
+	 * If we have already written data, it becomes a short write.
+	 * If we have exceeded the limit without writing any data, we
+	 * send a signal and return -EFBIG.
+ *
+	 * Linus's frestrict idea will clean these up nicely..
+ */
+
+ if (!S_ISBLK(inode->i_mode)) {
+ if (pos >= inode->i_sb->s_maxbytes)
+ {
+ if (count || pos > inode->i_sb->s_maxbytes) {
+ send_sig(SIGXFSZ, current, 0);
+ err = -EFBIG;
+ goto out;
+ }
+ /* zero-length writes at ->s_maxbytes are OK */
+ }
+
+ if (pos + count > inode->i_sb->s_maxbytes)
+ count = inode->i_sb->s_maxbytes - pos;
+ } else {
+ if (is_read_only(inode->i_rdev)) {
+ err = -EPERM;
+ goto out;
+ }
+ if (pos >= inode->i_size) {
+ if (count || pos > inode->i_size) {
+ err = -ENOSPC;
+ goto out;
+ }
+ }
+
+ if (pos + count > inode->i_size)
+ count = inode->i_size - pos;
+ }
+
+ err = 0;
+ if (count == 0)
+ goto out;
+
+ remove_suid(inode);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+
+ if (file->f_flags & O_DIRECT)
+ goto o_direct;
+
+ do {
+ unsigned long index, offset;
+ long page_fault;
+ char *kaddr;
+
+ /*
+ * Try to find the page in the cache. If it isn't there,
+ * allocate a free page.
+ */
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ index = pos >> PAGE_CACHE_SHIFT;
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ */
+ { volatile unsigned char dummy;
+ __get_user(dummy, buf);
+ __get_user(dummy, buf+bytes-1);
+ }
+
+ status = -ENOMEM; /* we'll assign it later anyway */
+ page = __grab_cache_page(mapping, index, &cached_page);
+ if (!page)
+ break;
+
+ /* We have exclusive IO access to the page.. */
+ if (!PageLocked(page)) {
+ PAGE_BUG(page);
+ }
+
+ kaddr = kmap(page);
+ status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
+ if (status)
+ goto unlock;
+ page_fault = __copy_from_user(kaddr+offset, buf, bytes);
+ flush_dcache_page(page);
+
+ conditional_schedule();
+
+ status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
+ if (page_fault)
+ goto fail_write;
+ if (!status)
+ status = bytes;
+
+ if (status >= 0) {
+ written += status;
+ count -= status;
+ pos += status;
+ buf += status;
+ }
+unlock:
+ kunmap(page);
+
+ /*
+ * Mark the page accessed if we wrote the
+ * beginning or we just did an lseek.
+ */
+ if (!offset || !file->f_reada)
+ SetPageReferenced(page);
+
+ /* Mark it unlocked again and drop the page.. */
+ UnlockPage(page);
+ page_cache_release(page);
+
+ conditional_schedule();
+
+ if (status < 0)
+ break;
+ } while (count);
+ *ppos = pos;
+
+ if (cached_page)
+ page_cache_release(cached_page);
+
+ /* For now, when the user asks for O_SYNC, we'll actually
+ * provide O_DSYNC. */
+ if ((status >= 0) && (file->f_flags & O_SYNC))
+ status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
+
+out_status:
+ err = written ? written : status;
+out:
+
+ up(&inode->i_sem);
+ return err;
+fail_write:
+ status = -EFAULT;
+ goto unlock;
+
+o_direct:
+ written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
+ if (written > 0) {
+ loff_t end = pos + written;
+ if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
+ inode->i_size = end;
+ mark_inode_dirty(inode);
+ }
+ *ppos = end;
+ invalidate_inode_pages2(mapping);
+ }
+ /*
+ * Sync the fs metadata but not the minor inode changes and
+ * of course not the data as we did direct DMA for the IO.
+ */
+ if (written >= 0 && file->f_flags & O_SYNC)
+ status = generic_osync_inode(inode, OSYNC_METADATA);
+ goto out_status;
+}
+
+void __init page_cache_init(unsigned long mempages)
+{
+ unsigned long htable_size, order;
+
+ htable_size = mempages;
+ htable_size *= sizeof(struct page *);
+ for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
+ ;
+
+ do {
+ unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
+
+ page_hash_bits = 0;
+ while((tmp >>= 1UL) != 0UL)
+ page_hash_bits++;
+
+ page_hash_table = (struct page **)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while(page_hash_table == NULL && --order > 0);
+
+ printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+ (1 << page_hash_bits), order, (PAGE_SIZE << order));
+ if (!page_hash_table)
+ panic("Failed to allocate page hash table\n");
+ memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
+}
diff --git a/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/sched.h b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/sched.h
new file mode 100644
index 000000000000..c2e891695008
--- /dev/null
+++ b/sys-kernel/linux-sources/files/2.4.15pre1aa1-fixes/sched.h
@@ -0,0 +1,944 @@
+#ifndef _LINUX_SCHED_H
+#define _LINUX_SCHED_H
+
+#include <asm/param.h> /* for HZ */
+
+extern unsigned long event;
+
+#include <linux/config.h>
+#include <linux/binfmts.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/times.h>
+#include <linux/timex.h>
+#include <linux/rbtree.h>
+#include <linux/condsched.h>
+
+#include <asm/system.h>
+#include <asm/semaphore.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/mmu.h>
+
+#include <linux/smp.h>
+#include <linux/tty.h>
+#include <linux/sem.h>
+#include <linux/signal.h>
+#include <linux/securebits.h>
+#include <linux/fs_struct.h>
+#include <linux/low-latency.h>
+#include <linux/numa_sched.h>
+
+struct exec_domain;
+
+/*
+ * cloning flags:
+ */
+#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
+#define CLONE_VM 0x00000100 /* set if VM shared between processes */
+#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
+#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
+#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
+#define CLONE_PID 0x00001000 /* set if pid shared */
+#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD 0x00010000 /* Same thread group? */
+
+#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD)
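+
+/*
+ * Illustrative user-space sketch (not part of this header): a
+ * LinuxThreads-style "thread" is a clone() child that shares VM, files,
+ * fs info and signal handlers with its parent.  The stack size, thread_fn
+ * and arg below are assumptions for the example only.
+ *
+ *	#define _GNU_SOURCE
+ *	#include <sched.h>
+ *	#include <stdlib.h>
+ *
+ *	char *stack = malloc(16384);
+ *	clone(thread_fn, stack + 16384,		// stack grows down on x86
+ *	      CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, arg);
+ */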
+
+/*
+ * These are the constants used to fake the fixed-point load-average
+ * counting. Some notes:
+ * - 11 bit fractions expand to 22 bits by the multiplies: this gives
+ * a load-average precision of 10 bits integer + 11 bits fractional
+ * - if you want to count load-averages more often, you need more
+ * precision, or rounding will get you. With 2-second counting freq,
+ * the EXP_n values would be 1981, 2034 and 2043 if still using only
+ * 11 bit fractions.
+ */
+extern unsigned long avenrun[]; /* Load averages */
+
+#define FSHIFT 11 /* nr of bits of precision */
+#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
+#define LOAD_FREQ (5*HZ) /* 5 sec intervals */
+#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
+#define EXP_5 2014 /* 1/exp(5sec/5min) */
+#define EXP_15 2037 /* 1/exp(5sec/15min) */
+
+#define CALC_LOAD(load,exp,n) \
+ load *= exp; \
+ load += n*(FIXED_1-exp); \
+ load >>= FSHIFT;
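+
+/*
+ * Worked example: with FIXED_1 = 2048 and EXP_1 = 1884, a 1-minute load
+ * of 1.00 (2048) and two runnable tasks (the caller passes
+ * n = 2*FIXED_1 = 4096) give
+ *
+ *	load = (2048*1884 + 4096*(2048 - 1884)) >> 11
+ *	     = (3858432 + 671744) >> 11 = 2212	(~1.08)
+ *
+ * so the average decays towards the instantaneous task count, recomputed
+ * every LOAD_FREQ ticks.
+ */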
+
+#define CT_TO_SECS(x) ((x) / HZ)
+#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
+
+extern int nr_running, nr_threads;
+extern int last_pid;
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/param.h>
+#include <linux/resource.h>
+#include <linux/timer.h>
+
+#include <asm/processor.h>
+
+#define TASK_RUNNING 0
+#define TASK_INTERRUPTIBLE 1
+#define TASK_UNINTERRUPTIBLE 2
+#define TASK_ZOMBIE 4
+#define TASK_STOPPED 8
+
+#define __set_task_state(tsk, state_value) \
+ do { (tsk)->state = (state_value); } while (0)
+#ifdef CONFIG_SMP
+#define set_task_state(tsk, state_value) \
+ set_mb((tsk)->state, (state_value))
+#else
+#define set_task_state(tsk, state_value) \
+ __set_task_state((tsk), (state_value))
+#endif
+
+#define __set_current_state(state_value) \
+ do { current->state = (state_value); } while (0)
+#ifdef CONFIG_SMP
+#define set_current_state(state_value) \
+ set_mb(current->state, (state_value))
+#else
+#define set_current_state(state_value) \
+ __set_current_state(state_value)
+#endif
+
+/*
+ * Scheduling policies
+ */
+#define SCHED_OTHER 0
+#define SCHED_FIFO 1
+#define SCHED_RR 2
+
+/*
+ * This is an additional bit set when we want to
+ * yield the CPU for one re-schedule..
+ */
+#define SCHED_YIELD 0x10
+
+struct sched_param {
+ int sched_priority;
+};
+
+struct completion;
+
+#ifdef __KERNEL__
+
+#include <linux/spinlock.h>
+
+/*
+ * This serializes "schedule()" and also protects
+ * the run-queue from deletions/modifications (but
+ * _adding_ to the beginning of the run-queue has
+ * a separate lock).
+ */
+extern rwlock_t tasklist_lock;
+extern spinlock_t runqueue_lock;
+extern spinlock_t mmlist_lock;
+
+extern void sched_init(void);
+extern void init_idle(void);
+extern void show_state(void);
+extern void cpu_init (void);
+extern void trap_init(void);
+extern void update_process_times(int user);
+extern void update_one_process(struct task_struct *p, unsigned long user,
+ unsigned long system, int cpu);
+
+#define MAX_SCHEDULE_TIMEOUT LONG_MAX
+extern signed long FASTCALL(schedule_timeout(signed long timeout));
+asmlinkage void schedule(void);
+
+extern int schedule_task(struct tq_struct *task);
+extern void flush_scheduled_tasks(void);
+extern int start_context_thread(void);
+extern int current_is_keventd(void);
+extern void force_cpu_reschedule(int cpu);
+
+/*
+ * The default fd array needs to be at least BITS_PER_LONG,
+ * as this is the granularity returned by copy_fdset().
+ */
+#define NR_OPEN_DEFAULT BITS_PER_LONG
+
+/*
+ * Open file table structure
+ */
+struct files_struct {
+ atomic_t count;
+ rwlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */
+ int max_fds;
+ int max_fdset;
+ int next_fd;
+ struct file ** fd; /* current fd array */
+ fd_set *close_on_exec;
+ fd_set *open_fds;
+ fd_set close_on_exec_init;
+ fd_set open_fds_init;
+ struct file * fd_array[NR_OPEN_DEFAULT];
+};
+
+#define INIT_FILES \
+{ \
+ count: ATOMIC_INIT(1), \
+ file_lock: RW_LOCK_UNLOCKED, \
+ max_fds: NR_OPEN_DEFAULT, \
+ max_fdset: __FD_SETSIZE, \
+ next_fd: 0, \
+ fd: &init_files.fd_array[0], \
+ close_on_exec: &init_files.close_on_exec_init, \
+ open_fds: &init_files.open_fds_init, \
+ close_on_exec_init: { { 0, } }, \
+ open_fds_init: { { 0, } }, \
+ fd_array: { NULL, } \
+}
+
+/* Maximum number of active map areas.. This is a random (large) number */
+#define MAX_MAP_COUNT (65536)
+
+struct mm_struct {
+ struct vm_area_struct * mmap; /* list of VMAs */
+ rb_root_t mm_rb;
+ struct vm_area_struct * mmap_cache; /* last find_vma result */
+ pgd_t * pgd;
+ atomic_t mm_users; /* How many users with user space? */
+ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
+ int map_count; /* number of VMAs */
+ struct rw_semaphore mmap_sem;
+ spinlock_t page_table_lock; /* Protects task page tables and mm->rss */
+
+ struct list_head mmlist; /* List of all active mm's. These are globally strung
+ * together off init_mm.mmlist, and are protected
+ * by mmlist_lock
+ */
+
+ unsigned long start_code, end_code, start_data, end_data;
+ unsigned long start_brk, brk, start_stack;
+ unsigned long arg_start, arg_end, env_start, env_end;
+ unsigned long rss, total_vm, locked_vm;
+ unsigned long def_flags;
+ unsigned long cpu_vm_mask;
+ unsigned long swap_address;
+
+ unsigned dumpable:1;
+
+ /* Architecture-specific MM context */
+ mm_context_t context;
+};
+
+extern int mmlist_nr;
+
+#define INIT_MM(name) \
+{ \
+ mm_rb: RB_ROOT, \
+ pgd: swapper_pg_dir, \
+ mm_users: ATOMIC_INIT(2), \
+ mm_count: ATOMIC_INIT(1), \
+ mmap_sem: RWSEM_INITIALIZER(name.mmap_sem), \
+ page_table_lock: SPIN_LOCK_UNLOCKED, \
+ mmlist: LIST_HEAD_INIT(name.mmlist), \
+}
+
+struct signal_struct {
+ atomic_t count;
+ struct k_sigaction action[_NSIG];
+ spinlock_t siglock;
+};
+
+
+#define INIT_SIGNALS { \
+ count: ATOMIC_INIT(1), \
+ action: { {{0,}}, }, \
+ siglock: SPIN_LOCK_UNLOCKED \
+}
+
+/*
+ * Some day this will be a full-fledged user tracking system..
+ */
+struct user_struct {
+ atomic_t __count; /* reference count */
+ atomic_t processes; /* How many processes does this user have? */
+ atomic_t files; /* How many open files does this user have? */
+
+ /* Hash table maintenance information */
+ struct user_struct *next, **pprev;
+ uid_t uid;
+};
+
+#define get_current_user() ({ \
+ struct user_struct *__user = current->user; \
+ atomic_inc(&__user->__count); \
+ __user; })
+
+extern struct user_struct root_user;
+#define INIT_USER (&root_user)
+
+struct zone_struct;
+
+struct local_pages {
+ struct list_head list;
+ unsigned int order, nr;
+ struct zone_struct * classzone;
+};
+
+struct task_struct {
+ /*
+ * offsets of these are hardcoded elsewhere - touch with care
+ */
+ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
+ unsigned long flags; /* per process flags, defined below */
+ int sigpending;
+ mm_segment_t addr_limit; /* thread address space:
+						0-0xBFFFFFFF for user-thread
+ 0-0xFFFFFFFF for kernel-thread
+ */
+ struct exec_domain *exec_domain;
+ volatile long need_resched;
+ unsigned long ptrace;
+
+ int lock_depth; /* Lock depth */
+
+/*
+ * offset 32 begins here on 32-bit platforms. We keep
+ * all fields in a single cacheline that are needed for
+ * the goodness() loop in schedule().
+ */
+ volatile int counter;
+ int nice;
+ unsigned int policy;
+ struct mm_struct *mm;
+ int has_cpu, processor;
+ unsigned long cpus_allowed;
+ /*
+ * (only the 'next' pointer fits into the cacheline, but
+ * that's just fine.)
+ */
+ struct list_head run_list;
+#ifdef CONFIG_NUMA_SCHED
+ int nid;
+#endif
+ int get_child_timeslice;
+ struct task_struct *next_task, *prev_task;
+ struct mm_struct *active_mm;
+ struct rw_sem_recursor mm_recursor;
+ struct local_pages local_pages;
+
+/* task state */
+ struct linux_binfmt *binfmt;
+ int exit_code, exit_signal;
+ int pdeath_signal; /* The signal sent when the parent dies */
+ /* ??? */
+ unsigned long personality;
+ int did_exec:1;
+ pid_t pid;
+ pid_t pgrp;
+ pid_t tty_old_pgrp;
+ pid_t session;
+ pid_t tgid;
+ /* boolean value for session group leader */
+ int leader;
+ /*
+ * pointers to (original) parent process, youngest child, younger sibling,
+ * older sibling, respectively. (p->father can be replaced with
+ * p->p_pptr->pid)
+ */
+ struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+ struct list_head thread_group;
+
+ /* PID hash table linkage. */
+ struct task_struct *pidhash_next;
+ struct task_struct **pidhash_pprev;
+
+ wait_queue_head_t wait_chldexit; /* for wait4() */
+ struct completion *vfork_done; /* for vfork() */
+ unsigned long rt_priority;
+ unsigned long it_real_value, it_prof_value, it_virt_value;
+ unsigned long it_real_incr, it_prof_incr, it_virt_incr;
+ struct timer_list real_timer;
+ struct tms times;
+ unsigned long start_time;
+ long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
+/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+ unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+ int swappable:1;
+/* process credentials */
+ uid_t uid,euid,suid,fsuid;
+ gid_t gid,egid,sgid,fsgid;
+ int ngroups;
+ gid_t groups[NGROUPS];
+ kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
+ int keep_capabilities:1;
+ struct user_struct *user;
+/* limits */
+ struct rlimit rlim[RLIM_NLIMITS];
+ unsigned short used_math;
+ char comm[16];
+/* file system info */
+ int link_count, total_link_count;
+ struct tty_struct *tty; /* NULL if no tty */
+ unsigned int locks; /* How many file locks are being held */
+/* ipc stuff */
+ struct sem_undo *semundo;
+ struct sem_queue *semsleeping;
+/* CPU-specific state of this task */
+ struct thread_struct thread;
+/* filesystem information */
+ struct fs_struct *fs;
+/* open file information */
+ struct files_struct *files;
+/* signal handlers */
+ spinlock_t sigmask_lock; /* Protects signal and blocked */
+ struct signal_struct *sig;
+
+ sigset_t blocked;
+ struct sigpending pending;
+
+ unsigned long sas_ss_sp;
+ size_t sas_ss_size;
+ int (*notifier)(void *priv);
+ void *notifier_data;
+ sigset_t *notifier_mask;
+
+ /* TUX state */
+ void *tux_info;
+ void (*tux_exit)(void);
+
+/* Thread group tracking */
+ u32 parent_exec_id;
+ u32 self_exec_id;
+/* Protection of (de-)allocation: mm, files, fs, tty */
+ spinlock_t alloc_lock;
+};
+
+/*
+ * Per process flags
+ */
+#define PF_EXITING (1UL<<0) /* getting shut down */
+#define PF_FORKNOEXEC (1UL<<1) /* forked but didn't exec */
+#define PF_SUPERPRIV (1UL<<2) /* used super-user privileges */
+#define PF_DUMPCORE (1UL<<3) /* dumped core */
+#define PF_SIGNALED (1UL<<4) /* killed by a signal */
+#define PF_MEMALLOC (1UL<<5) /* Allocating memory */
+#define PF_USEDFPU (1UL<<6) /* task used FPU this quantum (SMP) */
+#define PF_ATOMICALLOC (1UL<<7) /* do not block during memalloc */
+#define PF_FREE_PAGES (1UL<<8) /* per process page freeing */
+
+
+/*
+ * Ptrace flags
+ */
+
+#define PT_PTRACED 0x00000001
+#define PT_TRACESYS 0x00000002
+#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */
+#define PT_TRACESYSGOOD 0x00000008
+#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */
+
+/*
+ * Limit the stack to some sane default: root can always
+ * increase this limit if needed.. 8MB seems reasonable.
+ */
+#define _STK_LIM (8*1024*1024)
+
+#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
+#define MAX_COUNTER (20*HZ/100)
+#define DEF_NICE (0)
+
+
+/*
+ * The default (Linux) execution domain.
+ */
+extern struct exec_domain default_exec_domain;
+
+/*
+ * INIT_TASK is used to set up the first task table, touch at
+ * your own risk! Base=0, limit=0x1fffff (=2MB)
+ */
+#define INIT_TASK(tsk) \
+{ \
+ state: 0, \
+ flags: 0, \
+ sigpending: 0, \
+ addr_limit: KERNEL_DS, \
+ exec_domain: &default_exec_domain, \
+ lock_depth: -1, \
+ counter: DEF_COUNTER, \
+ nice: DEF_NICE, \
+ policy: SCHED_OTHER, \
+ mm: NULL, \
+ active_mm: &init_mm, \
+ mm_recursor: RWSEM_RECURSOR_INITIALIZER, \
+ cpus_allowed: -1UL, \
+ run_list: LIST_HEAD_INIT(tsk.run_list), \
+ next_task: &tsk, \
+ prev_task: &tsk, \
+ p_opptr: &tsk, \
+ p_pptr: &tsk, \
+ thread_group: LIST_HEAD_INIT(tsk.thread_group), \
+ wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
+ real_timer: { \
+ function: it_real_fn \
+ }, \
+ cap_effective: CAP_INIT_EFF_SET, \
+ cap_inheritable: CAP_INIT_INH_SET, \
+ cap_permitted: CAP_FULL_SET, \
+ keep_capabilities: 0, \
+ rlim: INIT_RLIMITS, \
+ user: INIT_USER, \
+ comm: "swapper", \
+ thread: INIT_THREAD, \
+ fs: &init_fs, \
+ files: &init_files, \
+ sigmask_lock: SPIN_LOCK_UNLOCKED, \
+ sig: &init_signals, \
+ pending: { NULL, &tsk.pending.head, {{0}}}, \
+ blocked: {{0}}, \
+ alloc_lock: SPIN_LOCK_UNLOCKED \
+}
+
+
+#ifndef INIT_TASK_SIZE
+# define INIT_TASK_SIZE 2048*sizeof(long)
+#endif
+
+union task_union {
+ struct task_struct task;
+ unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
+};
+
+extern union task_union init_task_union;
+
+extern struct mm_struct init_mm;
+extern struct task_struct *init_tasks[NR_CPUS];
+
+/* PID hashing. (shouldn't this be dynamic?) */
+#define PIDHASH_SZ (4096 >> 2)
+extern struct task_struct *pidhash[PIDHASH_SZ];
+
+#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
+
+static inline void hash_pid(struct task_struct *p)
+{
+ struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
+
+ if((p->pidhash_next = *htable) != NULL)
+ (*htable)->pidhash_pprev = &p->pidhash_next;
+ *htable = p;
+ p->pidhash_pprev = htable;
+}
+
+static inline void unhash_pid(struct task_struct *p)
+{
+ if(p->pidhash_next)
+ p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
+ *p->pidhash_pprev = p->pidhash_next;
+}
+
+static inline struct task_struct *find_task_by_pid(int pid)
+{
+ struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
+
+ for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
+ ;
+
+ return p;
+}
+
+/* per-UID process charging. */
+extern struct user_struct * alloc_uid(uid_t);
+extern void free_uid(struct user_struct *);
+
+#include <asm/current.h>
+
+extern unsigned long volatile jiffies;
+extern unsigned long itimer_ticks;
+extern unsigned long itimer_next;
+extern volatile struct timeval xtime;
+extern void do_timer(struct pt_regs *);
+
+extern unsigned int * prof_buffer;
+extern unsigned long prof_len;
+extern unsigned long prof_shift;
+
+#define CURRENT_TIME (xtime.tv_sec)
+
+extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
+extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
+extern void FASTCALL(sleep_on(wait_queue_head_t *q));
+extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
+ signed long timeout));
+extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
+extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
+ signed long timeout));
+extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+
+#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
+#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1)
+#define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr)
+#define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0)
+#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
+#define wake_up_interruptible_sync_nr(x, nr)	__wake_up_sync((x),TASK_INTERRUPTIBLE, nr)
+asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
+
+extern int in_group_p(gid_t);
+extern int in_egroup_p(gid_t);
+
+extern void proc_caches_init(void);
+extern void flush_signals(struct task_struct *);
+extern void flush_signal_handlers(struct task_struct *);
+extern int dequeue_signal(sigset_t *, siginfo_t *);
+extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+ sigset_t *mask);
+extern void unblock_all_signals(void);
+extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+extern int kill_pg_info(int, struct siginfo *, pid_t);
+extern int kill_sl_info(int, struct siginfo *, pid_t);
+extern int kill_proc_info(int, struct siginfo *, pid_t);
+extern void notify_parent(struct task_struct *, int);
+extern void do_notify_parent(struct task_struct *, int);
+extern void force_sig(int, struct task_struct *);
+extern int send_sig(int, struct task_struct *, int);
+extern int kill_pg(pid_t, int, int);
+extern int kill_sl(pid_t, int, int);
+extern int kill_proc(pid_t, int, int);
+extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
+extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
+
+static inline int signal_pending(struct task_struct *p)
+{
+ return (p->sigpending != 0);
+}
+
+/*
+ * Re-calculate pending state from the set of locally pending
+ * signals, globally pending signals, and blocked signals.
+ */
+static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+{
+ unsigned long ready;
+ long i;
+
+ switch (_NSIG_WORDS) {
+ default:
+ for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
+ ready |= signal->sig[i] &~ blocked->sig[i];
+ break;
+
+ case 4: ready = signal->sig[3] &~ blocked->sig[3];
+ ready |= signal->sig[2] &~ blocked->sig[2];
+ ready |= signal->sig[1] &~ blocked->sig[1];
+ ready |= signal->sig[0] &~ blocked->sig[0];
+ break;
+
+ case 2: ready = signal->sig[1] &~ blocked->sig[1];
+ ready |= signal->sig[0] &~ blocked->sig[0];
+ break;
+
+ case 1: ready = signal->sig[0] &~ blocked->sig[0];
+ }
+ return ready != 0;
+}
+
+/* Reevaluate whether the task has signals pending delivery.
+ This is required every time the blocked sigset_t changes.
+   All callers should hold t->sigmask_lock.  */
+
+static inline void recalc_sigpending(struct task_struct *t)
+{
+ t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+}
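+
+/*
+ * Editor's sketch (illustrative, not from the original source): the usual
+ * pattern when changing the blocked set is to update ->blocked and then
+ * re-derive ->sigpending, all with sigmask_lock held:
+ *
+ *	spin_lock_irq(&current->sigmask_lock);
+ *	sigaddset(&current->blocked, SIGINT);
+ *	recalc_sigpending(current);
+ *	spin_unlock_irq(&current->sigmask_lock);
+ */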
+
+/* True if we are on the alternate signal stack. */
+
+static inline int on_sig_stack(unsigned long sp)
+{
+ return (sp - current->sas_ss_sp < current->sas_ss_size);
+}
+
+static inline int sas_ss_flags(unsigned long sp)
+{
+ return (current->sas_ss_size == 0 ? SS_DISABLE
+ : on_sig_stack(sp) ? SS_ONSTACK : 0);
+}
+
+extern int request_irq(unsigned int,
+ void (*handler)(int, void *, struct pt_regs *),
+ unsigned long, const char *, void *);
+extern void free_irq(unsigned int, void *);
+
+/*
+ * This has now become a routine instead of a macro; it sets a flag if
+ * it returns true (to do BSD-style accounting where the process is flagged
+ * if it uses root privs). The implication of this is that you should do
+ * normal permissions checks first, and check suser() last.
+ *
+ * [Dec 1997 -- Chris Evans]
+ * For correctness, the above considerations need to be extended to
+ * fsuser(). This is done, along with moving fsuser() checks to be
+ * last.
+ *
+ * These will be removed, but in the meantime, when the SECURE_NOROOT
+ * flag is set, uids don't grant privilege.
+ */
+static inline int suser(void)
+{
+ if (!issecure(SECURE_NOROOT) && current->euid == 0) {
+ current->flags |= PF_SUPERPRIV;
+ return 1;
+ }
+ return 0;
+}
+
+static inline int fsuser(void)
+{
+ if (!issecure(SECURE_NOROOT) && current->fsuid == 0) {
+ current->flags |= PF_SUPERPRIV;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * capable() checks for a particular capability.
+ * New privilege checks should use this interface, rather than suser() or
+ * fsuser(). See include/linux/capability.h for defined capabilities.
+ */
+
+static inline int capable(int cap)
+{
+#if 1 /* ok now */
+ if (cap_raised(current->cap_effective, cap))
+#else
+ if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0)
+#endif
+ {
+ current->flags |= PF_SUPERPRIV;
+ return 1;
+ }
+ return 0;
+}
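+
+/*
+ * Editor's sketch (illustrative only): a privilege check that used to be
+ * written as "if (current->euid == 0)" or suser() would instead name the
+ * specific capability it needs, e.g. in a network driver ioctl:
+ *
+ *	if (!capable(CAP_NET_ADMIN))
+ *		return -EPERM;
+ */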
+
+/*
+ * Routines for handling mm_structs
+ */
+extern struct mm_struct * mm_alloc(void);
+
+extern struct mm_struct * start_lazy_tlb(void);
+extern void end_lazy_tlb(struct mm_struct *mm);
+
+/* mmdrop drops the mm and the page tables */
+extern inline void FASTCALL(__mmdrop(struct mm_struct *));
+static inline void mmdrop(struct mm_struct * mm)
+{
+ if (atomic_dec_and_test(&mm->mm_count))
+ __mmdrop(mm);
+}
+
+/* mmput gets rid of the mappings and all user-space */
+extern void mmput(struct mm_struct *);
+/* Remove the current task's stale references to the old mm_struct */
+extern void mm_release(void);
+
+/*
+ * Routines for handling the fd arrays
+ */
+extern struct file ** alloc_fd_array(int);
+extern int expand_fd_array(struct files_struct *, int nr);
+extern void free_fd_array(struct file **, int);
+
+extern fd_set *alloc_fdset(int);
+extern int expand_fdset(struct files_struct *, int nr);
+extern void free_fdset(fd_set *, int);
+
+extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+extern void flush_thread(void);
+extern void exit_thread(void);
+
+extern void exit_mm(struct task_struct *);
+extern void exit_files(struct task_struct *);
+extern void exit_sighand(struct task_struct *);
+
+extern void reparent_to_init(void);
+extern void daemonize(void);
+
+extern int do_execve(char *, char **, char **, struct pt_regs *);
+extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
+
+extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
+extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+
+#define nr_running_inc() \
+do { \
+ numa_nr_running_inc(); \
+ nr_running++; \
+} while (0)
+
+#define nr_running_dec() \
+do { \
+ numa_nr_running_dec(); \
+ nr_running--; \
+} while (0)
+
+#define nr_threads_inc() \
+do { \
+ numa_nr_threads_inc(); \
+ nr_threads++; \
+} while (0)
+
+#define nr_threads_dec() \
+do { \
+ numa_nr_threads_dec(); \
+ nr_threads--; \
+} while (0)
+
+#define __wait_event(wq, condition) \
+do { \
+ wait_queue_t __wait; \
+ init_waitqueue_entry(&__wait, current); \
+ \
+ add_wait_queue(&wq, &__wait); \
+ for (;;) { \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ if (condition) \
+ break; \
+ schedule(); \
+ } \
+ current->state = TASK_RUNNING; \
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+#define wait_event(wq, condition) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event(wq, condition); \
+} while (0)
+
+#define __wait_event_interruptible(wq, condition, ret) \
+do { \
+ wait_queue_t __wait; \
+ init_waitqueue_entry(&__wait, current); \
+ \
+ add_wait_queue(&wq, &__wait); \
+ for (;;) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (condition) \
+ break; \
+ if (!signal_pending(current)) { \
+ schedule(); \
+ continue; \
+ } \
+ ret = -ERESTARTSYS; \
+ break; \
+ } \
+ current->state = TASK_RUNNING; \
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+#define wait_event_interruptible(wq, condition) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __wait_event_interruptible(wq, condition, __ret); \
+ __ret; \
+})
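+
+/*
+ * Editor's sketch (illustrative, using a hypothetical wait queue and flag):
+ * a driver blocking until data arrives, and bailing out cleanly if the
+ * sleep is interrupted by a signal:
+ *
+ *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
+ *	static int data_ready;
+ *
+ *	if (wait_event_interruptible(my_wq, data_ready))
+ *		return -ERESTARTSYS;
+ *	(data_ready is now true; the producer sets data_ready = 1 and
+ *	 calls wake_up_interruptible(&my_wq))
+ */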
+
+#define REMOVE_LINKS(p) do { \
+ (p)->next_task->prev_task = (p)->prev_task; \
+ (p)->prev_task->next_task = (p)->next_task; \
+ if ((p)->p_osptr) \
+ (p)->p_osptr->p_ysptr = (p)->p_ysptr; \
+ if ((p)->p_ysptr) \
+ (p)->p_ysptr->p_osptr = (p)->p_osptr; \
+ else \
+ (p)->p_pptr->p_cptr = (p)->p_osptr; \
+ } while (0)
+
+#define SET_LINKS(p) do { \
+ (p)->next_task = &init_task; \
+ (p)->prev_task = init_task.prev_task; \
+ init_task.prev_task->next_task = (p); \
+ init_task.prev_task = (p); \
+ (p)->p_ysptr = NULL; \
+ if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
+ (p)->p_osptr->p_ysptr = p; \
+ (p)->p_pptr->p_cptr = p; \
+ } while (0)
+
+#define for_each_task(p) \
+ for (p = &init_task ; (p = p->next_task) != &init_task ; )
+
+#define next_thread(p) \
+ list_entry((p)->thread_group.next, struct task_struct, thread_group)
+
+#define del_from_runqueue(p) \
+do { \
+ nr_running_dec(); \
+ list_del(&(p)->run_list); \
+ (p)->run_list.next = NULL; \
+} while(0)
+
+static inline int task_on_runqueue(struct task_struct *p)
+{
+ return (p->run_list.next != NULL);
+}
+
+#define unhash_process(p) \
+do { \
+ if (task_on_runqueue(p)) BUG(); \
+ write_lock_irq(&tasklist_lock); \
+ nr_threads_dec(); \
+ unhash_pid(p); \
+ REMOVE_LINKS(p); \
+ list_del(&(p)->thread_group); \
+ write_unlock_irq(&tasklist_lock); \
+} while(0)
+
+/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
+static inline void task_lock(struct task_struct *p)
+{
+ spin_lock(&p->alloc_lock);
+}
+
+static inline void task_unlock(struct task_struct *p)
+{
+ spin_unlock(&p->alloc_lock);
+}
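+
+/*
+ * Editor's sketch (illustrative only): code that inspects another task's
+ * ->files, ->fs or ->mm typically brackets the access with these helpers:
+ *
+ *	task_lock(p);
+ *	files = p->files;
+ *	... look at files while the lock is held ...
+ *	task_unlock(p);
+ */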
+
+/* Write the full pathname into the buffer and return the start of the pathname */
+static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
+ char *buf, int buflen)
+{
+ char *res;
+ struct vfsmount *rootmnt;
+ struct dentry *root;
+ read_lock(&current->fs->lock);
+ rootmnt = mntget(current->fs->rootmnt);
+ root = dget(current->fs->root);
+ read_unlock(&current->fs->lock);
+ spin_lock(&dcache_lock);
+ res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen);
+ spin_unlock(&dcache_lock);
+ dput(root);
+ mntput(rootmnt);
+ return res;
+}
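+
+/*
+ * Editor's sketch (illustrative only): callers usually hand d_path() a
+ * whole page of scratch space and use the returned pointer, which points
+ * somewhere inside that buffer:
+ *
+ *	char *page = (char *) __get_free_page(GFP_KERNEL);
+ *	if (page) {
+ *		char *path = d_path(dentry, mnt, page, PAGE_SIZE);
+ *		printk("path: %s\n", path);
+ *		free_page((unsigned long) page);
+ *	}
+ */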
+
+#endif /* __KERNEL__ */
+
+#endif