Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull namespace updates from Eric Biederman:
"After a lot of discussion and work we have finally reachanged a basic
understanding of what is necessary to make unprivileged mounts safe in
the presence of EVM and IMA xattrs which the last commit in this
series reflects. While technically it is a revert the comments it adds
are important for people not getting confused in the future. Clearing
up that confusion allows us to seriously work on unprivileged mounts
of fuse in the next development cycle.
The rest of the fixes in this set are in the intersection of user
namespaces, ptrace, and exec. I started with the first fix which
started a feedback cycle of finding additional issues during review
and fixing them. Culiminating in a fix for a bug that has been present
since at least Linux v1.0.
Potentially these fixes were candidates for being merged during the rc
cycle, and are certainly backport candidates but enough little things
turned up during review and testing that I decided they should be
handled as part of the normal development process just to be certain
there were not any great surprises when it came time to backport some
of these fixes"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
Revert "evm: Translate user/group ids relative to s_user_ns when computing HMAC"
exec: Ensure mm->user_ns contains the execed files
ptrace: Don't allow accessing an undumpable mm
ptrace: Capture the ptracer's creds not PT_PTRACE_CAP
mm: Add a user_ns owner to mm_struct and fix ptrace permission checks
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index 940dfb4..04abdec 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -283,7 +283,7 @@
/* When I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
case PTRACE_PEEKDATA:
- copied = access_process_vm(child, addr, &tmp, sizeof(tmp),
+ copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp),
FOLL_FORCE);
ret = -EIO;
if (copied != sizeof(tmp))
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index 8d79286..360d996 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -270,7 +270,7 @@
switch (bfin_mem_access_type(addr, to_copy)) {
case BFIN_MEM_ACCESS_CORE:
case BFIN_MEM_ACCESS_CORE_ONLY:
- copied = access_process_vm(child, addr, &tmp,
+ copied = ptrace_access_vm(child, addr, &tmp,
to_copy, FOLL_FORCE);
if (copied)
break;
@@ -323,7 +323,7 @@
switch (bfin_mem_access_type(addr, to_copy)) {
case BFIN_MEM_ACCESS_CORE:
case BFIN_MEM_ACCESS_CORE_ONLY:
- copied = access_process_vm(child, addr, &data,
+ copied = ptrace_access_vm(child, addr, &data,
to_copy,
FOLL_FORCE | FOLL_WRITE);
break;
diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c
index f0df654..fe1f9cf7b 100644
--- a/arch/cris/arch-v32/kernel/ptrace.c
+++ b/arch/cris/arch-v32/kernel/ptrace.c
@@ -147,7 +147,7 @@
/* The trampoline page is globally mapped, no page table to traverse.*/
tmp = *(unsigned long*)addr;
} else {
- copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE);
+ copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
break;
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 31aa8c0..36f660d 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1159,7 +1159,7 @@
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
/* read word at location addr */
- if (access_process_vm(child, addr, &data, sizeof(data),
+ if (ptrace_access_vm(child, addr, &data, sizeof(data),
FOLL_FORCE)
!= sizeof(data))
return -EIO;
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 7e71a4e..5fcbdcd 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -69,7 +69,7 @@
if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0)
break;
- copied = access_process_vm(child, (u64)addrOthers, &tmp,
+ copied = ptrace_access_vm(child, (u64)addrOthers, &tmp,
sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
break;
@@ -178,7 +178,7 @@
if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0)
break;
ret = 0;
- if (access_process_vm(child, (u64)addrOthers, &data,
+ if (ptrace_access_vm(child, (u64)addrOthers, &data,
sizeof(data),
FOLL_FORCE | FOLL_WRITE) == sizeof(data))
break;
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c
index 010b7b3..1e887f3 100644
--- a/arch/powerpc/kernel/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -73,7 +73,7 @@
if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
break;
- copied = access_process_vm(child, (u64)addrOthers, &tmp,
+ copied = ptrace_access_vm(child, (u64)addrOthers, &tmp,
sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
break;
@@ -178,7 +178,7 @@
if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
break;
ret = 0;
- if (access_process_vm(child, (u64)addrOthers, &tmp,
+ if (ptrace_access_vm(child, (u64)addrOthers, &tmp,
sizeof(tmp),
FOLL_FORCE | FOLL_WRITE) == sizeof(tmp))
break;
diff --git a/fs/exec.c b/fs/exec.c
index 923c57d..88b5e1e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1277,8 +1277,22 @@
void would_dump(struct linux_binprm *bprm, struct file *file)
{
- if (inode_permission(file_inode(file), MAY_READ) < 0)
+ struct inode *inode = file_inode(file);
+ if (inode_permission(inode, MAY_READ) < 0) {
+ struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+
+ /* Ensure mm->user_ns contains the executable */
+ user_ns = old = bprm->mm->user_ns;
+ while ((user_ns != &init_user_ns) &&
+ !privileged_wrt_inode_uidgid(user_ns, inode))
+ user_ns = user_ns->parent;
+
+ if (old != user_ns) {
+ bprm->mm->user_ns = get_user_ns(user_ns);
+ put_user_ns(old);
+ }
+ }
}
EXPORT_SYMBOL(would_dump);
@@ -1308,7 +1322,6 @@
!gid_eq(bprm->cred->gid, current_egid())) {
current->pdeath_signal = 0;
} else {
- would_dump(bprm, bprm->file);
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
set_dumpable(current->mm, suid_dumpable);
}
@@ -1408,7 +1421,7 @@
unsigned n_fs;
if (p->ptrace) {
- if (p->ptrace & PT_PTRACE_CAP)
+ if (ptracer_capable(p, current_user_ns()))
bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
else
bprm->unsafe |= LSM_UNSAFE_PTRACE;
@@ -1743,6 +1756,8 @@
if (retval < 0)
goto out;
+ would_dump(bprm, bprm->file);
+
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
diff --git a/include/linux/capability.h b/include/linux/capability.h
index dbc21c7..6ffb67e 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -240,8 +240,10 @@
return true;
}
#endif /* CONFIG_MULTIUSER */
+extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
+extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a92c8d7..0b5b2e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1270,6 +1270,8 @@
unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
+extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, unsigned int gup_flags);
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4a8aced..08d947f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -473,6 +473,7 @@
*/
struct task_struct __rcu *owner;
#endif
+ struct user_namespace *user_ns;
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 504c98a..e0e5393 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -8,6 +8,9 @@
#include <linux/pid_namespace.h> /* For task_active_pid_ns. */
#include <uapi/linux/ptrace.h>
+extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags);
+
/*
* Ptrace flags
*
@@ -19,7 +22,6 @@
#define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */
#define PT_PTRACED 0x00000001
#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */
-#define PT_PTRACE_CAP 0x00000004 /* ptracer can follow suid-exec */
#define PT_OPT_FLAG_SHIFT 3
/* PT_TRACE_* event enable flags */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ccbbfe..a440cf1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1685,6 +1685,7 @@
struct list_head cpu_timers[3];
/* process credentials */
+ const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
const struct cred __rcu *real_cred; /* objective and real subjective task
* credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
diff --git a/kernel/capability.c b/kernel/capability.c
index 00411c8..4984e1f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -457,6 +457,19 @@
EXPORT_SYMBOL(file_ns_capable);
/**
+ * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
+ * @ns: The user namespace in question
+ * @inode: The inode in question
+ *
+ * Return true if the inode uid and gid are within the namespace.
+ */
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+{
+ return kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
+}
+
+/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
@@ -469,7 +482,26 @@
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
+
+/**
+ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
+ * @tsk: The task that may be ptraced
+ * @ns: The user namespace to search for CAP_SYS_PTRACE in
+ *
+ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
+ * in the specified user namespace.
+ */
+bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
+{
+ int ret = 0; /* An absent tracer adds no restrictions */
+ const struct cred *cred;
+ rcu_read_lock();
+ cred = rcu_dereference(tsk->ptracer_cred);
+ if (cred)
+ ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ rcu_read_unlock();
+ return (ret == 0);
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index a439ac4..869b8cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -747,7 +747,8 @@
#endif
}
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
+ struct user_namespace *user_ns)
{
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
@@ -787,6 +788,7 @@
if (init_new_context(p, mm))
goto fail_nocontext;
+ mm->user_ns = get_user_ns(user_ns);
return mm;
fail_nocontext:
@@ -832,7 +834,7 @@
return NULL;
memset(mm, 0, sizeof(*mm));
- return mm_init(mm, current);
+ return mm_init(mm, current, current_user_ns());
}
/*
@@ -847,6 +849,7 @@
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
+ put_user_ns(mm->user_ns);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1128,7 +1131,7 @@
memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm, tsk))
+ if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
err = dup_mmap(mm, oldmm);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e6474f7..49ba7c1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -27,6 +27,35 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
+/*
+ * Access another process' address space via ptrace.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ if (!tsk->ptrace ||
+ (current != tsk->parent) ||
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptracer_capable(tsk, mm->user_ns))) {
+ mmput(mm);
+ return 0;
+ }
+
+ ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+ mmput(mm);
+
+ return ret;
+}
+
/*
* ptrace a task: make the debugger its new parent and
@@ -39,6 +68,9 @@
BUG_ON(!list_empty(&child->ptrace_entry));
list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
+ rcu_read_lock();
+ child->ptracer_cred = get_cred(__task_cred(new_parent));
+ rcu_read_unlock();
}
/**
@@ -71,12 +103,16 @@
*/
void __ptrace_unlink(struct task_struct *child)
{
+ const struct cred *old_cred;
BUG_ON(!child->ptrace);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
+ old_cred = child->ptracer_cred;
+ child->ptracer_cred = NULL;
+ put_cred(old_cred);
spin_lock(&child->sighand->siglock);
child->ptrace = 0;
@@ -220,7 +256,7 @@
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
- int dumpable = 0;
+ struct mm_struct *mm;
kuid_t caller_uid;
kgid_t caller_gid;
@@ -271,16 +307,11 @@
return -EPERM;
ok:
rcu_read_unlock();
- smp_rmb();
- if (task->mm)
- dumpable = get_dumpable(task->mm);
- rcu_read_lock();
- if (dumpable != SUID_DUMP_USER &&
- !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
+ mm = task->mm;
+ if (mm &&
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptrace_has_cap(mm->user_ns, mode)))
+ return -EPERM;
return security_ptrace_access_check(task, mode);
}
@@ -344,10 +375,6 @@
if (seize)
flags |= PT_SEIZED;
- rcu_read_lock();
- if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
- flags |= PT_PTRACE_CAP;
- rcu_read_unlock();
task->ptrace = flags;
__ptrace_link(task, current);
@@ -537,7 +564,8 @@
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
- retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
+ retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE);
+
if (!retval) {
if (copied)
break;
@@ -564,7 +592,7 @@
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
- retval = access_process_vm(tsk, dst, buf, this_len,
+ retval = ptrace_access_vm(tsk, dst, buf, this_len,
FOLL_FORCE | FOLL_WRITE);
if (!retval) {
if (copied)
@@ -1128,7 +1156,7 @@
unsigned long tmp;
int copied;
- copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
+ copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
return -EIO;
return put_user(tmp, (unsigned long __user *)data);
@@ -1139,7 +1167,7 @@
{
int copied;
- copied = access_process_vm(tsk, addr, &data, sizeof(data),
+ copied = ptrace_access_vm(tsk, addr, &data, sizeof(data),
FOLL_FORCE | FOLL_WRITE);
return (copied == sizeof(data)) ? 0 : -EIO;
}
@@ -1157,7 +1185,7 @@
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
- ret = access_process_vm(child, addr, &word, sizeof(word),
+ ret = ptrace_access_vm(child, addr, &word, sizeof(word),
FOLL_FORCE);
if (ret != sizeof(word))
ret = -EIO;
@@ -1167,7 +1195,7 @@
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
- ret = access_process_vm(child, addr, &data, sizeof(data),
+ ret = ptrace_access_vm(child, addr, &data, sizeof(data),
FOLL_FORCE | FOLL_WRITE);
ret = (ret != sizeof(data) ? -EIO : 0);
break;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a56a851..975e49f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -6,6 +6,7 @@
#include <linux/cpumask.h>
#include <linux/atomic.h>
+#include <linux/user_namespace.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
@@ -21,5 +22,6 @@
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ .user_ns = &init_user_ns,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/memory.c b/mm/memory.c
index c264f7c..08d8da3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3904,7 +3904,7 @@
* Access another process' address space as given in mm. If non-NULL, use the
* given task for page fault accounting.
*/
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
diff --git a/mm/nommu.c b/mm/nommu.c
index 9720e0b..27bc543 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1808,7 +1808,7 @@
}
EXPORT_SYMBOL(filemap_map_pages);
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index bf66391..d7f282d 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -151,8 +151,16 @@
memset(&hmac_misc, 0, sizeof(hmac_misc));
hmac_misc.ino = inode->i_ino;
hmac_misc.generation = inode->i_generation;
- hmac_misc.uid = from_kuid(inode->i_sb->s_user_ns, inode->i_uid);
- hmac_misc.gid = from_kgid(inode->i_sb->s_user_ns, inode->i_gid);
+ /* The hmac uid and gid must be encoded in the initial user
+ * namespace (not the filesystems user namespace) as encoding
+ * them in the filesystems user namespace allows an attack
+ * where first they are written in an unprivileged fuse mount
+ * of a filesystem and then the system is tricked to mount the
+ * filesystem for real on next boot and trust it because
+ * everything is signed.
+ */
+ hmac_misc.uid = from_kuid(&init_user_ns, inode->i_uid);
+ hmac_misc.gid = from_kgid(&init_user_ns, inode->i_gid);
hmac_misc.mode = inode->i_mode;
crypto_shash_update(desc, (const u8 *)&hmac_misc, sizeof(hmac_misc));
if (evm_hmac_attrs & EVM_ATTR_FSUUID)