Index: usr.sbin/jls/jls.8 =================================================================== --- usr.sbin/jls/jls.8 (revision 718) +++ usr.sbin/jls/jls.8 (working copy) @@ -42,7 +42,8 @@ .Sh SEE ALSO .Xr jail 2 , .Xr jail 8 , -.Xr jexec 8 +.Xr jexec 8 , +.Xr jtune 8 .Sh HISTORY The .Nm Index: usr.sbin/Makefile =================================================================== --- usr.sbin/Makefile (revision 718) +++ usr.sbin/Makefile (working copy) @@ -73,6 +73,7 @@ jail \ jexec \ jls \ + jtune \ kbdcontrol \ kbdmap \ ${_keyserv} \ Index: usr.sbin/jtune/jtune.c =================================================================== --- usr.sbin/jtune/jtune.c (revision 0) +++ usr.sbin/jtune/jtune.c (revision 0) @@ -0,0 +1,206 @@ +/*- + * Copyright (c) 2006 Chris Jones + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Chris Jones + * thanks to the support of Google's Summer of Code program and + * mentoring by Kip Macy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +
+/* Bounds that keep parsed values, and megabytes scaled to bytes, within int. */
+#define	JTUNE_MAX_MB	2047
+#define	JTUNE_MAX_INT	0x7fffffffL
+
+static void usage(void);
+static int parse_number(const char *, long, long, const char *);
+static void format_mb(char *, size_t, unsigned int, int);
+static struct xprison *getxprison(int);
+
+/* Show (-i) or change (-m, -M, -s) the resource limits of the jail given by -j. */
+int
+main(int argc, char **argv)
+{
+	struct xprison *xp;
+	char memuse[16], fullmemuse[16], memlim[16], hard_memlim[16];
+	int jid = 0;
+	int memlimit = -1;		/* -1: leave the kernel's value alone */
+	int hard_memlimit = -1;
+	int shares = -1;
+	int iflag = 0;
+	int ch;
+
+	while ((ch = getopt(argc, argv, "ij:m:M:s:")) != -1) {
+		switch (ch) {
+		case 'i':
+			iflag = 1;
+			break;
+		case 'j':
+			jid = parse_number(optarg, 1, JTUNE_MAX_INT, "jail id");
+			break;
+		case 'm':
+			/* Given in megabytes; the syscall takes bytes. */
+			memlimit = parse_number(optarg, 0, JTUNE_MAX_MB,
+			    "memory limit") * 1024 * 1024;
+			break;
+		case 'M':
+			hard_memlimit = parse_number(optarg, 0, JTUNE_MAX_MB,
+			    "memory limit") * 1024 * 1024;
+			break;
+		case 's':
+			shares = parse_number(optarg, 0, JTUNE_MAX_INT,
+			    "cpu share");
+			break;
+		default:
+			usage();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+	if (!jid)
+		usage();
+
+	xp = getxprison(jid);
+	if (NULL == xp)
+		errx(1, "no jail with id %d", jid);
+
+	if (iflag) {
+		format_mb(memuse, sizeof(memuse), xp->pr_mem_usage, 0);
+		format_mb(fullmemuse, sizeof(fullmemuse),
+		    xp->pr_full_mem_usage, 0);
+		format_mb(memlim, sizeof(memlim), xp->pr_mem_limit, 1);
+		format_mb(hard_memlim, sizeof(hard_memlim),
+		    xp->pr_hard_mem_limit, 1);
+		printf(" JID Hostname Memory Used / Memory Limits CPU Shares\n");
+		printf(" RSS | VSS / Soft | Hard\n");
+		printf("%6d %-24.24s %5s|%4s / %-6.6s|%-6.6s %-4u\n",
+		    xp->pr_id, xp->pr_host,
+		    memuse, fullmemuse, memlim, hard_memlim,
+		    xp->pr_sched_shares);
+		exit(0);
+	}
+
+	/* err(), not errx(): presumably the failed syscall leaves its reason in errno. */
+	if (jail_set_resource_limits(jid, shares, memlimit, hard_memlimit))
+		err(1, "jail_set_resource_limits(%d, %d, %d, %d)",
+		    jid, shares, memlimit, hard_memlimit);
+	exit(0);
+}
+
+/*
+ * Parse a decimal integer in [min, max] from arg, or exit naming what was
+ * being parsed.  strtol() is used because atoi() cannot report bad input.
+ */
+static int
+parse_number(const char *arg, long min, long max, const char *what)
+{
+	char *ep;
+	long val;
+
+	errno = 0;
+	val = strtol(arg, &ep, 10);
+	if (ep == arg || *ep != '\0' || errno == ERANGE ||
+	    val < min || val > max)
+		errx(1, "invalid %s '%s'", what, arg);
+	return ((int)val);
+}
+
+/*
+ * Render a byte count as whole megabytes into buf; when none_if_zero is
+ * set, a count of zero is rendered as "None" instead.
+ */
+static void
+format_mb(char *buf, size_t len, unsigned int bytes, int none_if_zero)
+{
+	if (none_if_zero && bytes == 0)
+		snprintf(buf, len, "None");
+	else
+		snprintf(buf, len, "%u M", bytes / (1024 * 1024));
+}
+
+/* Print the command synopsis and exit with a failure status. */
+static void
+usage(void)
+{
+	(void)fprintf(stderr, "%s\n",
+	    "usage: jtune -j jail_id [-i] [-m mem_limit] [-M hard_mem_limit] [-s cpu_shares]");
+	exit(1);
+}
+
+/*
+ * Read security.jail.list and return a malloc()ed copy of the entry whose
+ * id is jid, or NULL when no listed jail matches.  The caller frees it.
+ */
+static struct xprison *
+getxprison(int jid)
+{
+	size_t i, len;
+	struct xprison *xpl, *xp;
+
+	if (sysctlbyname("security.jail.list", NULL, &len, NULL, 0) == -1)
+		err(1, "sysctlbyname(): security.jail.list");
+	if (len == 0)
+		errx(1, "sysctl security.jail.list has no entries for jid %d", jid);
+
+	xpl = malloc(len);
+	if (NULL == xpl)
+		err(1, "malloc()");
+	if (sysctlbyname("security.jail.list", xpl, &len, NULL, 0) == -1)
+		err(1, "sysctlbyname(): security.jail.list");
+	if (len < sizeof(*xpl) || len % sizeof(*xpl) ||
+	    xpl->pr_version != XPRISON_VERSION)
+		errx(1, "Kernel and userland out of sync");
+
+	xp = NULL;
+	for (i = 0; i < len / sizeof(*xpl); i++) {
+		if (jid != xpl[i].pr_id)
+			continue;
+		xp = malloc(sizeof(*xp));
+		if (NULL == xp)
+			err(1, "malloc()");
+		*xp = xpl[i];
+		break;
+	}
+	free(xpl);
+	return (xp);
+}
 Index: usr.sbin/jtune/jtune.8 =================================================================== --- usr.sbin/jtune/jtune.8 (revision 0) +++ usr.sbin/jtune/jtune.8 (revision 0) @@ -0,0 +1,75 @@ +.\" Copyright (c) 2006 Chris Jones +.\" All rights reserved. +.\" +.\" This software was developed for the FreeBSD Project by Chris Jones +.\" thanks to the support of Google's Summer of Code program and +.\" mentoring by Kip Macy. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd August 21, 2006
+.Dt JTUNE 8
+.Os
+.Sh NAME
+.Nm jtune
+.Nd "modify jail resource limits"
+.Sh SYNOPSIS
+.Nm
+.Fl j Ar jail_id
+.Op Fl i
+.Op Fl m Ar mem_limit
+.Op Fl M Ar hard_mem_limit
+.Op Fl s Ar cpu_shares
+.Sh DESCRIPTION
+The
+.Nm
+utility modifies a jail's memory and CPU usage limits.
+.Pp
+The options are as follows:
+.Bl -tag -width ".Fl M Ar hard_mem_limit"
+.It Fl j Ar jail_id
+Jail identifier (JID) of the jail whose limits are being tuned.
+.It Fl i
+Show jail's resource limits.
+.It Fl m Ar mem_limit
+Limit the jail's memory usage (resident set size), given in megabytes.
+.It Fl M Ar hard_mem_limit
+Hard limit, in megabytes, on the jail's resident plus swapped memory.
+.It Fl s Ar cpu_shares
+Set the number of CPU shares assigned to the jail.
+.El
+.Sh SEE ALSO
+.Xr jail 2 ,
+.Xr jail 8 ,
+.Xr jexec 8 ,
+.Xr jls 8
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Fx FIXME .
+.Pp
+.Nm
+was written by Chris Jones through the 2006 Google Summer of Code
+program. 
Index: usr.sbin/jtune/Makefile =================================================================== --- usr.sbin/jtune/Makefile (revision 0) +++ usr.sbin/jtune/Makefile (revision 0) @@ -0,0 +1,10 @@ +# $FreeBSD$ + +PROG= jtune +MAN= jtune.8 +DPADD= ${LIBUTIL} +LDADD= -lutil + +WARNS?= 6 + +.include Index: usr.sbin/jail/jail.c =================================================================== --- usr.sbin/jail/jail.c (revision 718) +++ usr.sbin/jail/jail.c (working copy) @@ -56,6 +56,9 @@ struct in_addr in; gid_t groups[NGROUPS]; int ch, i, iflag, Jflag, lflag, ngroups, securelevel, uflag, Uflag; + unsigned int mem_limit = 0; + unsigned int hard_mem_limit = 0; + unsigned int sched_shares = 0; char path[PATH_MAX], *ep, *username, *JidFile; static char *cleanenv; const char *shell, *p = NULL; @@ -67,15 +70,26 @@ username = JidFile = cleanenv = NULL; fp = NULL; - while ((ch = getopt(argc, argv, "ils:u:U:J:")) != -1) { + while ((ch = getopt(argc, argv, "ilS:H:M:s:u:U:J:")) != -1) { switch (ch) { case 'i': iflag = 1; break; + case 'H': + hard_mem_limit = atoi(optarg); + hard_mem_limit *= 1024 * 1024; + break; case 'J': JidFile = optarg; Jflag = 1; break; + case 'M': + mem_limit = atoi(optarg); + mem_limit *= 1024 * 1024; + break; + case 'S': + sched_shares = atoi(optarg); + break; case 's': ltmp = strtol(optarg, &ep, 0); if (*ep || ep == optarg || ltmp > INT_MAX || !ltmp) @@ -118,6 +132,9 @@ if (inet_aton(argv[2], &in) == 0) errx(1, "Could not make sense of ip-number: %s", argv[2]); j.ip_number = ntohl(in.s_addr); + j.mem_limit = mem_limit; + j.hard_mem_limit = hard_mem_limit; + j.sched_shares = sched_shares; if (Jflag) { fp = fopen(JidFile, "w"); if (fp == NULL) @@ -182,8 +199,10 @@ usage(void) { - (void)fprintf(stderr, "%s%s%s\n", - "usage: jail [-i] [-J jid_file] [-s securelevel] [-l -u ", + (void)fprintf(stderr, "%s%s%s%s%s\n", + "usage: jail [-i] [-J jid_file] [-H hard_mem_limit] [-M mem_limit] ", + "[-S cpu_shares] [-s securelevel]", + " [-l -u ", "username | -U 
username]", " path hostname ip-number command ..."); exit(1); Index: usr.sbin/jail/jail.8 =================================================================== --- usr.sbin/jail/jail.8 (revision 718) +++ usr.sbin/jail/jail.8 (working copy) @@ -45,6 +45,8 @@ .Op Fl J Ar jid_file .Op Fl s Ar securelevel .Op Fl l u Ar username | Fl U Ar username +.Op Fl S Ar cpu_shares +.Op Fl M Ar mem_limit .Ar path hostname ip-number command ... .Sh DESCRIPTION The @@ -88,6 +90,10 @@ The user name from jailed environment as whom the .Ar command should run. +.It Fl S Ar cpu_shares +CPU shares to assign to the prison. +.It Fl M Ar mem_limit +Amount of memory (in MB) to allow the prison to use. .It Ar path Directory which is to be the root of the prison. .It Ar hostname @@ -550,6 +556,17 @@ This MIB entry determines if a privileged user inside a jail will be able to mount and unmount file system types marked as jail-friendly. The +.It Va security.jail.limit_jail_memory, Va security.jail.jail_pager_interval +These MIB entries determine whether and how often (in seconds) a +jail's memory-limit monitoring daemon will run, and consequently the +period during which a jail can be overcommitted for resident memory. +.It Va kern.sched.limit_jail_cpu +This MIB entry sets whether CPU usage limits will be enforced +against processes in jails with CPU limits. +.It Va kern.sched.system_cpu_shares +Number of CPU usage shares to allocate to unjailed processes for the +purposes of determining CPU usage permitted for jailed processes. +Unjailed processes are not subject to CPU usage limits. .Xr lsvfs 1 command can be used to find file system types available for mount from within a jail. 
Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map (revision 718) +++ lib/libc/sys/Symbol.map (working copy) @@ -131,6 +131,7 @@ issetugid; jail; jail_attach; + jail_set_resource_limits; kenv; kevent; kill; @@ -601,6 +602,8 @@ __sys_jail; _jail_attach; __sys_jail_attach; + _jail_set_resource_limits; + __sys_jail_set_resource_limits; _kenv; __sys_kenv; _kevent; Index: sys/kern/kern_jail.c =================================================================== --- sys/kern/kern_jail.c (revision 718) +++ sys/kern/kern_jail.c (working copy) @@ -5,8 +5,38 @@ * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- + * + * Portions copyright (c) 2006 Chris Jones, + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Chris Jones + * thanks to the support of Google's Summer of Code program and + * mentoring by Kip Macy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * */ + #include __FBSDID("$FreeBSD$"); @@ -15,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -29,10 +60,17 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -78,6 +116,18 @@ &jail_mount_allowed, 0, "Processes in jail can mount/unmount jail-friendly file systems"); +int jail_limit_memory = 0; +SYSCTL_INT(_security_jail, OID_AUTO, limit_jail_memory, CTLFLAG_RW, + &jail_limit_memory, 0, + "Limit jails' memory usage"); + +int jail_memory_pager_interval = 5; +SYSCTL_INT(_security_jail, OID_AUTO, jail_pager_interval, + CTLTYPE_INT | CTLFLAG_RW, + &jail_memory_pager_interval, 0, + "Interval between jail memory limit checks"); + + /* allprison, lastprid, and prisoncount are protected by allprison_lock. 
*/ struct prisonlist allprison; struct sx allprison_lock; @@ -114,6 +164,156 @@ SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
+/*
+ * Per-jail pager thread; arg is the prison to police.  Each pass refreshes
+ * the jail's memory accounting and, if limits are enforced, kills (hard
+ * limit) or deactivates pages of (soft limit) the jail's largest process.
+ * NOTE(review): pr_pager_flags_ptr points into this thread's stack, so it
+ * is NULL before the thread first runs and dangles after exit -- confirm.
+ */
+static void
+jpager_td(void *arg)
+{
+	struct proc *p, *bigproc;
+	struct prison *pr = arg;
+	struct thread *td;
+	long limit, hard_limit, cursize, newsize, usage, full_usage;
+	int breakout;
+	int flags = J_PAGER_TD_ACTIVE;
+	vm_offset_t bigsize, bigressize, size;
+	pr->pr_pager_flags_ptr = &flags;
+	printf ("Starting Prison Pager: %i\n", pr->pr_id);
+	for (;;) {
+		bigproc = NULL;
+		bigsize = 0;
+		newsize = 0;
+		cursize = 0;
+		bigressize = 0;
+		if (flags & J_PAGER_TD_DIE)
+			break;
+		/*
+		 * Populate the prison struct memory usage fields even if we
+		 * are not restricting it.  Useful for getting full jail stats.
+		 */
+		prison_memory(pr);
+		/* Copy out the current memory usage from the prison struct. */
+		mtx_lock(&pr->pr_mtx);
+		usage = pr->pr_mem_usage;
+		full_usage = pr->pr_full_mem_usage;
+		mtx_unlock(&pr->pr_mtx);
+		/* Grab the current limits. */
+		limit = prison_memory_limit(pr);
+		hard_limit = prison_hard_memory_limit(pr);
+
+		if (jail_limit_memory && (limit > 0 || hard_limit > 0)) {
+			/* Find the largest process; it is left PROC_LOCKed. */
+			sx_slock(&allproc_lock);
+			LIST_FOREACH(p, &allproc, p_list) {
+				if (pr != p->p_ucred->cr_prison || !p->p_vmspace)
+					continue;
+
+				/* Skip it rather than block on its lock (avoids deadlock). */
+				if (PROC_TRYLOCK(p) == 0)
+					continue;
+
+				/*
+				 * Skip system processes, exiting processes
+				 * and protected processes.
+				 * TODO: We should probably ignore init here.
+				 */
+				if (p->p_flag & (P_SYSTEM | P_WEXIT | P_PROTECTED)) {
+					PROC_UNLOCK(p);
+					continue;
+				}
+
+				/* Skip it if any thread is not queued, running or sleeping. */
+				PROC_SLOCK(p);
+				breakout = 0;
+				FOREACH_THREAD_IN_PROC(p, td) {
+					thread_lock(td);
+					if (!TD_ON_RUNQ(td) &&
+					    !TD_IS_RUNNING(td) &&
+					    !TD_IS_SLEEPING(td)) {
+						thread_unlock(td);
+						breakout = 1;
+						break;
+					}
+					thread_unlock(td);
+				}
+				PROC_SUNLOCK(p);
+				if (breakout) {
+					PROC_UNLOCK(p);
+					continue;
+				}
+
+				/*
+				 * Get the process size (swap + resident pages); the
+				 * resident part alone seeds the soft-limit target.
+				 */
+				if (!vm_map_trylock_read(&p->p_vmspace->vm_map)) {
+					PROC_UNLOCK(p);
+					continue;
+				}
+				size = vmspace_swap_count(p->p_vmspace);
+				vm_map_unlock_read(&p->p_vmspace->vm_map);
+				cursize = vmspace_resident_count(p->p_vmspace);
+				size += cursize;
+				/*
+				 * If this process is bigger than the biggest
+				 * one so far, remember it.
+				 */
+				if (size > bigsize) {
+					if (bigproc != NULL)
+						PROC_UNLOCK(bigproc);
+					bigproc = p;
+					bigsize = size;
+					bigressize = cursize;
+				} else {
+					PROC_UNLOCK(p);
+				}
+			} /* end LIST_FOREACH procs - Find largest process */
+			sx_sunlock(&allproc_lock);
+
+			/*
+			 * If we have a hard limit and we are exceeding it,
+			 * kill the biggest process in the jail.
+			 */
+			if ((hard_limit > 0) && ((full_usage - hard_limit) > 0)) {
+				if (bigproc != NULL) {
+					killproc(bigproc, "Jail Memory Limit Exceeded");
+					PROC_SLOCK(bigproc);
+					sched_nice(bigproc, PRIO_MIN);
+					PROC_SUNLOCK(bigproc);
+					PROC_UNLOCK(bigproc);
+				}
+			/*
+			 * If we have a soft limit then ask for a reduction in
+			 * the process's resident memory (by one sixteenth).
+			 */
+			} else if ((limit > 0) && ((usage - limit) > 0)) {
+				if (bigproc != NULL) {
+					newsize = bigressize;
+					newsize -= newsize / 16;
+					PROC_UNLOCK(bigproc);
+					/*
+					 * NOTE(review): bigproc is used after its lock is
+					 * dropped -- confirm it cannot exit underneath us.
+					 */
+					vm_pageout_map_deactivate_pages(&bigproc->p_vmspace->vm_map, newsize);
+				}
+			/* Unlock big proc, we don't need it any more. */
+			} else {
+				if (bigproc != NULL) {
+					PROC_UNLOCK(bigproc);
+				}
+			}
+		}
+		tsleep(pr, 0, "-", jail_memory_pager_interval * hz);
+	}
+
+	kthread_exit(0);
+}
+
 /* * struct jail_args { * struct jail *jail; @@ -127,6 +327,7 @@ struct prison_service *psrv; struct jail j; struct jail_attach_args jaa; + struct proc *j_pager_proc = NULL; int vfslocked, error, tryprid; error = copyin(uap->jail, &j, sizeof(j)); @@ -135,6 +336,7 @@ if (j.version != 0) return (EINVAL); + MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF); pr->pr_ref = 1; @@ -156,7 +358,10 @@ goto e_dropvnref; pr->pr_ip = j.ip_number; pr->pr_linux = NULL; + pr->pr_sched_shares = j.sched_shares; pr->pr_securelevel = securelevel; + pr->pr_mem_limit = j.mem_limit; + pr->pr_hard_mem_limit = j.hard_mem_limit; if (prison_service_slots == 0) pr->pr_slots = NULL; else { @@ -169,6 +374,7 @@ tryprid = lastprid + 1; if (tryprid == JAIL_MAX) tryprid = 1; + next: LIST_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id == tryprid) { @@ -190,6 +396,11 @@ } sx_sunlock(&allprison_lock); + error = kthread_create(jpager_td, pr, &j_pager_proc, 0, 0, "jpager %d", pr->pr_id); + if (error) + goto e_dropprref; + pr->pr_pager = j_pager_proc; + error = jail_attach(td, &jaa); if (error) goto e_dropprref; @@ -199,6 +410,11 @@ td->td_retval[0] = jaa.jid; return (0); e_dropprref: + if (j_pager_proc != NULL && pr->pr_pager_flags_ptr != NULL) { + *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE; + wakeup(pr); + } + sx_xlock(&allprison_lock); LIST_REMOVE(pr, pr_list); prisoncount--; @@ -269,11 +485,13 @@ newcred = crget(); PROC_LOCK(p); + oldcred = p->p_ucred; setsugid(p); crcopy(newcred, oldcred); newcred->cr_prison = pr; p->p_ucred = newcred; + PROC_UNLOCK(p); crfree(oldcred); return (0); @@ -316,6 +534,9 @@ pr->pr_ref--; if (pr->pr_ref == 0) { mtx_unlock(&pr->pr_mtx); + *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE; + wakeup(pr); + TASK_INIT(&pr->pr_task, 0, prison_complete, pr); 
taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; @@ -440,6 +661,123 @@ return (ok); }
+/*
+ * Recompute the memory used by pr's processes and store it, in bytes, in
+ * pr_mem_usage (resident) and pr_full_mem_usage (resident plus swap).
+ */
+void
+prison_memory(struct prison *pr)
+{
+	struct proc *p;
+	long mem_used = 0;
+	long full_mem_used = 0;
+	long proc_res;
+
+	/*
+	 * TODO: this is a really bad way of doing the search, as we end up
+	 * going across all processes for each jail.  It'd be more efficient
+	 * to just do this once in a period and update the relevant jail.
+	 */
+	sx_slock(&allproc_lock);	/* keep allproc stable during the walk */
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		if (!jailed(p->p_ucred) || pr != p->p_ucred->cr_prison ||
+		    !p->p_vmspace) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		/* Skip processes whose map is busy rather than wait. */
+		if (!vm_map_trylock_read(&p->p_vmspace->vm_map)) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		full_mem_used += vmspace_swap_count(p->p_vmspace);
+		vm_map_unlock_read(&p->p_vmspace->vm_map);
+		proc_res = vmspace_resident_count(p->p_vmspace);
+		full_mem_used += proc_res;
+		mem_used += proc_res;
+		PROC_UNLOCK(p);
+	}
+	sx_sunlock(&allproc_lock);
+	mem_used *= PAGE_SIZE;
+	full_mem_used *= PAGE_SIZE;
+	/* Copy the current memory usage to the prison struct */
+	mtx_lock(&pr->pr_mtx);
+	pr->pr_mem_usage = mem_used;
+	pr->pr_full_mem_usage = full_mem_used;
+	mtx_unlock(&pr->pr_mtx);
+}
+
+/* Return pr's soft memory limit, read under the prison mutex. */
+long
+prison_memory_limit(struct prison *pr)
+{
+	long memlimit;
+	mtx_lock(&pr->pr_mtx);
+	memlimit = (long)pr->pr_mem_limit;
+	mtx_unlock(&pr->pr_mtx);
+	return (memlimit);
+}
+
+/* Return pr's hard memory limit, read under the prison mutex. */
+long
+prison_hard_memory_limit(struct prison *pr)
+{
+	long memlimit;
+	mtx_lock(&pr->pr_mtx);
+	memlimit = (long)pr->pr_hard_mem_limit;
+	mtx_unlock(&pr->pr_mtx);
+	return (memlimit);
+}
+
+/*
+ * Change resource limit for a prison. 
+ *
+ * unsigned int jid: id of jail to mess with
+ * int cpushares:     0 -> remove prison from cpu limits
+ *                   -1 -> don't change existing shares
+ *                   >0 -> set cpu shares
+ * int memlimit, hardmemlimit (soft and hard limit, in bytes):
+ *                    0 -> remove prison from that mem limit
+ *                   -1 -> don't change existing limit
+ *                   >0 -> set that memory limit
+ * Returns 0, the suser() error, or EINVAL for an unknown jid or a value
+ * below -1.  TODO: would a writable sysctl serve better than a syscall?
+ */
+int
+jail_set_resource_limits(struct thread *td, struct jail_set_resource_limits_args *uap)
+{
+	struct prison *pr;
+	int error;
+
+	error = suser(td);
+	if (error)
+		return (error);
+	/* The limits are stored unsigned; -1 is the only negative allowed. */
+	if (uap->cpushares < -1 || uap->memlimit < -1 || uap->hardmemlimit < -1)
+		return (EINVAL);
+
+	sx_xlock(&allprison_lock);
+	LIST_FOREACH(pr, &allprison, pr_list) {
+		if (pr->pr_id == uap->jid)
+			break;
+	}
+	if (NULL == pr) {
+		sx_unlock(&allprison_lock);
+		return (EINVAL);	/* no jail has this id */
+	}
+	mtx_lock(&pr->pr_mtx);
+	if (-1 != uap->cpushares)
+		pr->pr_sched_shares = uap->cpushares;
+	if (-1 != uap->memlimit)
+		pr->pr_mem_limit = uap->memlimit;
+	if (-1 != uap->hardmemlimit)
+		pr->pr_hard_mem_limit = uap->hardmemlimit;
+	mtx_unlock(&pr->pr_mtx);
+	sx_unlock(&allprison_lock);
+	return (0);
+}
+
 /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. 
*/ @@ -965,9 +1303,17 @@ xp->pr_id = pr->pr_id; xp->pr_ip = pr->pr_ip; strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path)); + mtx_lock(&pr->pr_mtx); + xp->pr_sched_shares = pr->pr_sched_shares; + xp->pr_estcpu = pr->pr_estcpu; + xp->pr_mem_limit = pr->pr_mem_limit; + xp->pr_hard_mem_limit = pr->pr_hard_mem_limit; + xp->pr_mem_usage = pr->pr_mem_usage; + xp->pr_full_mem_usage = pr->pr_full_mem_usage; strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host)); mtx_unlock(&pr->pr_mtx); + xp++; } sx_sunlock(&allprison_lock); Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master (revision 718) +++ sys/kern/syscalls.master (working copy) @@ -870,6 +870,8 @@ 488 AUE_NULL STD { int cpuset_setaffinity(cpulevel_t level, \ cpuwhich_t which, id_t id, size_t cpusetsize, \ const cpuset_t *mask); } +489 AUE_NULL STD { int jail_set_resource_limits(unsigned int jid, \ + int cpushares, int memlimit, int hardmemlimit); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/vm/vm_pageout.c =================================================================== --- sys/vm/vm_pageout.c (revision 718) +++ sys/vm/vm_pageout.c (working copy) @@ -211,7 +211,6 @@ CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); #if !defined(NO_SWAPPING) -static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(int req); #endif @@ -597,7 +596,7 @@ * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. 
*/ -static void +void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; Index: sys/vm/vm_pageout.h =================================================================== --- sys/vm/vm_pageout.h (revision 718) +++ sys/vm/vm_pageout.h (working copy) @@ -87,6 +87,8 @@ * Exported routines. */ +void vm_pageout_map_deactivate_pages(vm_map_t map, long desired); + /* * Signal pageout-daemon and wait for it. */ Index: sys/sys/jail.h =================================================================== --- sys/sys/jail.h (revision 718) +++ sys/sys/jail.h (working copy) @@ -18,6 +18,9 @@ char *path; char *hostname; u_int32_t ip_number; + unsigned int sched_shares; + unsigned int mem_limit; + unsigned int hard_mem_limit; }; struct xprison { @@ -26,13 +29,26 @@ char pr_path[MAXPATHLEN]; char pr_host[MAXHOSTNAMELEN]; u_int32_t pr_ip; + unsigned int pr_sched_shares; + unsigned int pr_estcpu; + unsigned int pr_mem_limit; + unsigned int pr_hard_mem_limit; + unsigned int pr_mem_usage; + unsigned int pr_full_mem_usage; }; -#define XPRISON_VERSION 1 +#define XPRISON_VERSION 2 +#define JAIL_MINIMUM_SHARES 1 + +#define J_PAGER_TD_ACTIVE 0x01 +#define J_PAGER_TD_DIE 0x02 +#define J_PAGER_TD_DEAD 0x04 + #ifndef _KERNEL int jail(struct jail *); int jail_attach(int); +int jail_set_resource_limits(unsigned int, int, int, int); #else /* _KERNEL */ @@ -73,6 +89,14 @@ int pr_securelevel; /* (p) securelevel */ struct task pr_task; /* (d) destroy task */ struct mtx pr_mtx; + u_int32_t pr_sched_shares; /* (p) jail priority */ + u_int pr_estcpu; /* (p) est. 
cpu of jail */ + struct proc *pr_pager; /* (c) pager pid */ + int *pr_pager_flags_ptr; /* (p) communication to pager */ + size_t pr_mem_limit; /* (p) memory allocation limit */ + size_t pr_hard_mem_limit; /* (p) hard memory allocation limit */ + size_t pr_mem_usage; /* (p) memory in use */ + size_t pr_full_mem_usage; /* (p) memory in use */ void **pr_slots; /* (p) additional data */ }; #endif /* _KERNEL || _WANT_PRISON */ @@ -113,6 +137,9 @@ void prison_hold(struct prison *pr); int prison_if(struct ucred *cred, struct sockaddr *sa); int prison_ip(struct ucred *cred, int flag, u_int32_t *ip); +void prison_memory(struct prison *pr); +long prison_memory_limit(struct prison *pr); +long prison_hard_memory_limit(struct prison *pr); int prison_priv_check(struct ucred *cred, int priv); void prison_remote_ip(struct ucred *cred, int flags, u_int32_t *ip); Index: sys/sys/syscall.mk =================================================================== --- sys/sys/syscall.mk (revision 718) +++ sys/sys/syscall.mk (working copy) @@ -1,7 +1,7 @@ # FreeBSD system call names. # DO NOT EDIT-- this file is automatically generated. # $FreeBSD$ -# created from FreeBSD: stable/7/sys/kern/syscalls.master 180808 2008-07-25 17:46:01Z jhb +# created from FreeBSD MIASM = \ syscall.o \ exit.o \ @@ -356,4 +356,5 @@ cpuset_setid.o \ cpuset_getid.o \ cpuset_getaffinity.o \ - cpuset_setaffinity.o + cpuset_setaffinity.o \ + jail_set_resource_limits.o