SlideShare uma empresa Scribd logo
1 de 15
Baixar para ler offline
pidfds
Process file descriptors on Linux
Christian Brauner
christian.brauner@ubuntu.com
@brau_ner
https://people.kernel.org/brauner
https://brauner.io
pidfd: what's that?
file descriptor referring to a process
specifically, an fd referring to a thread-group leader
stable, private handle
fd guarantees to reference the same process
pidfds use pre-existing stable process handle
reference struct pid, not task_struct
struct pid
{
refcount_t count;
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
struct upid numbers[1];
};
Why do this in the first place?
pid recycling
avoid pitfalls of pid recycling on high-pressure systems
CVE-2019-6133: https://bugs.chromium.org/p/project-zero/issues/detail?id=1692
CVE-2014-5033: https://www.cvedetails.com/cve/CVE-2014-5033/
pid-based mac exploits: https://objective-see.com/blog/blog_0x41.html
https://doc.qt.io/qt-5/qprocess.html#startDetached
https://marc.info/?l=openssl-dev&m=130289811108150&w=2
CVE-2017-13209: (Android - Hardware Service Manager Arbitrary Service Replacement due to getpidcon)
https://www.exploit-db.com/exploits/43513
Issue 851: Android: racy getpidcon usage permits binder service replacement
https://bugs.chromium.org/p/project-zero/issues/detail?id=851
Why do this in the first place?
shared libraries
allow to spawn invisible helper processes
process management delegation
hand of a handle to a non-parent process (e.g. for waiting, signaling)
ubiquity of fds
common patterns already exist everywhere in userspace
Does userspace really care about this feature?
dbus
https://gitlab.freedesktop.org/dbus/dbus/issues/274
qt
https://codereview.qt-project.org/c/qt/qtbase/+/108456
systemd
https://github.com/systemd/systemd/issues/13101
criu
https://github.com/checkpoint-restore/criu/issues/717
lmkd
https://android-review.googlesource.com/c/platform/system/core/+/1088157
bpftrace
https://github.com/iovisor/bpftrace/issues/880
mio
https://github.com/samuelbrian/mio-pidfd
Prior art
Illumos
pure userspace emulation of stable process handle
procopen(), procrun(), procclose(), procfree(), etc.
OpenBSD, NetBSD
no private, stable process handles
FreeBSD
procdesc: pdfork(), pdgetpid(), pdkill()
Linux
forkfd(), CLONE_FD
Building a new api
4 kernel releases
individual elements to create a complete api
5.1
sending signals
using pidfds to reliably send signals
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
siginfo_t __user *, info, unsigned int, flags)
{
int ret;
struct fd f;
struct pid *pid;
kernel_siginfo_t kinfo;
/* Enforce flags be set to 0 until we add an extension. */
if (flags)
return -EINVAL;
f = fdget(pidfd);
if (!f.file)
return -EBADF;
/* Is this a pidfd? */
pid = pidfd_to_pid(f.file);
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto err;
}
5.2
CLONE_PIDFD
create pidfds at process creation time
O_CLOEXEC
pidfds are close-on-exec by default
/proc/<pid>/fd/fdinfo
contains pid of process in procfs pidns
/*
* This has to happen after we've potentially unshared the file
* descriptor table (so that the pidfd doesn't leak into the child
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
O_RDWR | O_CLOEXEC);
if (IS_ERR(pidfile)) {
put_unused_fd(pidfd);
retval = PTR_ERR(pidfile);
goto bad_fork_free_pid;
}
get_pid(pid); /* held by pidfile now */
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
}
5.3
polling support
exit notification for non-parents
static void do_notify_pidfd(struct task_struct *task)
{
struct pid *pid;
WARN_ON(task->exit_state == 0);
pid = task_pid(task);
wake_up_all(&pid->wait_pidfd);
}
/*
* Poll support for process exit notification.
*/
static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct task_struct *task;
struct pid *pid = file->private_data;
int poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
rcu_read_lock();
task = pid_task(pid, PIDTYPE_PID);
/*
* Inform pollers only when the whole thread group exits.
* If the thread group leader exits before all other threads in the
* group, then poll(2) should block, similar to the wait(2) family.
*/
if (!task || (task->exit_state && thread_group_empty(task)))
poll_flags = POLLIN | POLLRDNORM;
rcu_read_unlock();
return poll_flags;
}
5.3
pidfds without CLONE_PIDFD
pidfd_open() to create pidfd
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
int fd, ret;
struct pid *p;
if (flags)
return -EINVAL;
if (pid <= 0)
return -EINVAL;
p = find_get_pid(pid);
if (!p)
return -ESRCH;
ret = 0;
rcu_read_lock();
if (!pid_task(p, PIDTYPE_TGID))
ret = -EINVAL;
rcu_read_unlock();
fd = ret ?: pidfd_create(p);
put_pid(p);
return fd;
}
5.4 (proposed)
waiting through pidfds
P_PIDFD for waitid()
case P_PIDFD:
type = PIDTYPE_PID;
if (upid < 0)
return -EINVAL;
pid = pidfd_get_pid(upid);
if (IS_ERR(pid))
return PTR_ERR(pid);
break;
default:
return -EINVAL;
}
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options;
wo.wo_info = infop;
wo.wo_rusage = ru;
ret = do_wait(&wo);
Kill-on-close
SIGKILL on last close
kill process when last fd referencing it is closed
exclusive waiting
CLONE_WAIT_PID
hide process from generic wait requests (e.g. waitid(P_ALL)
Lessons learned
speed matters
choose a sustainable speed for developing features
be open about being "dumb"
it's ok to say "I don't know" or "I can't review that"
be resilient
reviews are a form of critique

Mais conteúdo relacionado

Semelhante a Kernel Recipes 2019 - pidfds: Process file descriptors on Linux

Unit 6
Unit 6Unit 6
Unit 6
siddr
 
PyCon 2013 : Scripting to PyPi to GitHub and More
PyCon 2013 : Scripting to PyPi to GitHub and MorePyCon 2013 : Scripting to PyPi to GitHub and More
PyCon 2013 : Scripting to PyPi to GitHub and More
Matt Harrison
 
Unit 5
Unit 5Unit 5
Unit 5
siddr
 

Semelhante a Kernel Recipes 2019 - pidfds: Process file descriptors on Linux (20)

Debugging Hung Python Processes With GDB
Debugging Hung Python Processes With GDBDebugging Hung Python Processes With GDB
Debugging Hung Python Processes With GDB
 
Unit 6
Unit 6Unit 6
Unit 6
 
Lecture2 process structure and programming
Lecture2   process structure and programmingLecture2   process structure and programming
Lecture2 process structure and programming
 
Unix kernal
Unix kernalUnix kernal
Unix kernal
 
Chromium Sandbox on Linux (BlackHoodie 2018)
Chromium Sandbox on Linux (BlackHoodie 2018)Chromium Sandbox on Linux (BlackHoodie 2018)
Chromium Sandbox on Linux (BlackHoodie 2018)
 
Threads Advance in System Administration with Linux
Threads Advance in System Administration with LinuxThreads Advance in System Administration with Linux
Threads Advance in System Administration with Linux
 
How to Design a Great API (using flask) [ploneconf2017]
How to Design a Great API (using flask) [ploneconf2017]How to Design a Great API (using flask) [ploneconf2017]
How to Design a Great API (using flask) [ploneconf2017]
 
Sysdig
SysdigSysdig
Sysdig
 
2015.07.16 Способы диагностики PostgreSQL
2015.07.16 Способы диагностики PostgreSQL2015.07.16 Способы диагностики PostgreSQL
2015.07.16 Способы диагностики PostgreSQL
 
SELinux Kernel Internals and Architecture - FOSS.IN/2005
SELinux Kernel Internals and Architecture - FOSS.IN/2005SELinux Kernel Internals and Architecture - FOSS.IN/2005
SELinux Kernel Internals and Architecture - FOSS.IN/2005
 
Lecture_Slide_4.pptx
Lecture_Slide_4.pptxLecture_Slide_4.pptx
Lecture_Slide_4.pptx
 
Linux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloudLinux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloud
 
SDE TP 4 - Processus
SDE TP 4 - ProcessusSDE TP 4 - Processus
SDE TP 4 - Processus
 
PyCon 2013 : Scripting to PyPi to GitHub and More
PyCon 2013 : Scripting to PyPi to GitHub and MorePyCon 2013 : Scripting to PyPi to GitHub and More
PyCon 2013 : Scripting to PyPi to GitHub and More
 
Container: is it safe enough to run you application?
Container: is it safe enough to run you application?Container: is it safe enough to run you application?
Container: is it safe enough to run you application?
 
Unit 5
Unit 5Unit 5
Unit 5
 
Systems Programming Assignment Help - Processes
Systems Programming Assignment Help - ProcessesSystems Programming Assignment Help - Processes
Systems Programming Assignment Help - Processes
 
리눅스 드라이버 #2
리눅스 드라이버 #2리눅스 드라이버 #2
리눅스 드라이버 #2
 
configuring a warm standby, the easy way
configuring a warm standby, the easy wayconfiguring a warm standby, the easy way
configuring a warm standby, the easy way
 
CH03.pdf
CH03.pdfCH03.pdf
CH03.pdf
 

Mais de Anne Nicolas

Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary dataKernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Anne Nicolas
 

Mais de Anne Nicolas (20)

Kernel Recipes 2019 - Driving the industry toward upstream first
Kernel Recipes 2019 - Driving the industry toward upstream firstKernel Recipes 2019 - Driving the industry toward upstream first
Kernel Recipes 2019 - Driving the industry toward upstream first
 
Kernel Recipes 2019 - No NMI? No Problem! – Implementing Arm64 Pseudo-NMI
Kernel Recipes 2019 - No NMI? No Problem! – Implementing Arm64 Pseudo-NMIKernel Recipes 2019 - No NMI? No Problem! – Implementing Arm64 Pseudo-NMI
Kernel Recipes 2019 - No NMI? No Problem! – Implementing Arm64 Pseudo-NMI
 
Kernel Recipes 2019 - Hunting and fixing bugs all over the Linux kernel
Kernel Recipes 2019 - Hunting and fixing bugs all over the Linux kernelKernel Recipes 2019 - Hunting and fixing bugs all over the Linux kernel
Kernel Recipes 2019 - Hunting and fixing bugs all over the Linux kernel
 
Kernel Recipes 2019 - Metrics are money
Kernel Recipes 2019 - Metrics are moneyKernel Recipes 2019 - Metrics are money
Kernel Recipes 2019 - Metrics are money
 
Kernel Recipes 2019 - Kernel documentation: past, present, and future
Kernel Recipes 2019 - Kernel documentation: past, present, and futureKernel Recipes 2019 - Kernel documentation: past, present, and future
Kernel Recipes 2019 - Kernel documentation: past, present, and future
 
Embedded Recipes 2019 - Knowing your ARM from your ARSE: wading through the t...
Embedded Recipes 2019 - Knowing your ARM from your ARSE: wading through the t...Embedded Recipes 2019 - Knowing your ARM from your ARSE: wading through the t...
Embedded Recipes 2019 - Knowing your ARM from your ARSE: wading through the t...
 
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary dataKernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
Kernel Recipes 2019 - GNU poke, an extensible editor for structured binary data
 
Kernel Recipes 2019 - Analyzing changes to the binary interface exposed by th...
Kernel Recipes 2019 - Analyzing changes to the binary interface exposed by th...Kernel Recipes 2019 - Analyzing changes to the binary interface exposed by th...
Kernel Recipes 2019 - Analyzing changes to the binary interface exposed by th...
 
Embedded Recipes 2019 - Remote update adventures with RAUC, Yocto and Barebox
Embedded Recipes 2019 - Remote update adventures with RAUC, Yocto and BareboxEmbedded Recipes 2019 - Remote update adventures with RAUC, Yocto and Barebox
Embedded Recipes 2019 - Remote update adventures with RAUC, Yocto and Barebox
 
Embedded Recipes 2019 - Making embedded graphics less special
Embedded Recipes 2019 - Making embedded graphics less specialEmbedded Recipes 2019 - Making embedded graphics less special
Embedded Recipes 2019 - Making embedded graphics less special
 
Embedded Recipes 2019 - Linux on Open Source Hardware and Libre Silicon
Embedded Recipes 2019 - Linux on Open Source Hardware and Libre SiliconEmbedded Recipes 2019 - Linux on Open Source Hardware and Libre Silicon
Embedded Recipes 2019 - Linux on Open Source Hardware and Libre Silicon
 
Embedded Recipes 2019 - From maintaining I2C to the big (embedded) picture
Embedded Recipes 2019 - From maintaining I2C to the big (embedded) pictureEmbedded Recipes 2019 - From maintaining I2C to the big (embedded) picture
Embedded Recipes 2019 - From maintaining I2C to the big (embedded) picture
 
Embedded Recipes 2019 - Testing firmware the devops way
Embedded Recipes 2019 - Testing firmware the devops wayEmbedded Recipes 2019 - Testing firmware the devops way
Embedded Recipes 2019 - Testing firmware the devops way
 
Embedded Recipes 2019 - Herd your socs become a matchmaker
Embedded Recipes 2019 - Herd your socs become a matchmakerEmbedded Recipes 2019 - Herd your socs become a matchmaker
Embedded Recipes 2019 - Herd your socs become a matchmaker
 
Embedded Recipes 2019 - LLVM / Clang integration
Embedded Recipes 2019 - LLVM / Clang integrationEmbedded Recipes 2019 - LLVM / Clang integration
Embedded Recipes 2019 - LLVM / Clang integration
 
Embedded Recipes 2019 - Introduction to JTAG debugging
Embedded Recipes 2019 - Introduction to JTAG debuggingEmbedded Recipes 2019 - Introduction to JTAG debugging
Embedded Recipes 2019 - Introduction to JTAG debugging
 
Embedded Recipes 2019 - Pipewire a new foundation for embedded multimedia
Embedded Recipes 2019 - Pipewire a new foundation for embedded multimediaEmbedded Recipes 2019 - Pipewire a new foundation for embedded multimedia
Embedded Recipes 2019 - Pipewire a new foundation for embedded multimedia
 
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all startedKernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
 
Kernel Recipes 2019 - Suricata and XDP
Kernel Recipes 2019 - Suricata and XDPKernel Recipes 2019 - Suricata and XDP
Kernel Recipes 2019 - Suricata and XDP
 
Kernel Recipes 2019 - Marvels of Memory Auto-configuration (SPD)
Kernel Recipes 2019 - Marvels of Memory Auto-configuration (SPD)Kernel Recipes 2019 - Marvels of Memory Auto-configuration (SPD)
Kernel Recipes 2019 - Marvels of Memory Auto-configuration (SPD)
 

Último

+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
Health
 
Abortion Pill Prices Tembisa [(+27832195400*)] 🏥 Women's Abortion Clinic in T...
Abortion Pill Prices Tembisa [(+27832195400*)] 🏥 Women's Abortion Clinic in T...Abortion Pill Prices Tembisa [(+27832195400*)] 🏥 Women's Abortion Clinic in T...
Abortion Pill Prices Tembisa [(+27832195400*)] 🏥 Women's Abortion Clinic in T...
Medical / Health Care (+971588192166) Mifepristone and Misoprostol tablets 200mg
 
Abortion Pills In Pretoria ](+27832195400*)[ 🏥 Women's Abortion Clinic In Pre...
Abortion Pills In Pretoria ](+27832195400*)[ 🏥 Women's Abortion Clinic In Pre...Abortion Pills In Pretoria ](+27832195400*)[ 🏥 Women's Abortion Clinic In Pre...
Abortion Pills In Pretoria ](+27832195400*)[ 🏥 Women's Abortion Clinic In Pre...
Medical / Health Care (+971588192166) Mifepristone and Misoprostol tablets 200mg
 
The title is not connected to what is inside
The title is not connected to what is insideThe title is not connected to what is inside
The title is not connected to what is inside
shinachiaurasa2
 
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
masabamasaba
 

Último (20)

WSO2Con2024 - From Code To Cloud: Fast Track Your Cloud Native Journey with C...
WSO2Con2024 - From Code To Cloud: Fast Track Your Cloud Native Journey with C...WSO2Con2024 - From Code To Cloud: Fast Track Your Cloud Native Journey with C...
WSO2Con2024 - From Code To Cloud: Fast Track Your Cloud Native Journey with C...
 
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
 
OpenChain - The Ramifications of ISO/IEC 5230 and ISO/IEC 18974 for Legal Pro...
OpenChain - The Ramifications of ISO/IEC 5230 and ISO/IEC 18974 for Legal Pro...OpenChain - The Ramifications of ISO/IEC 5230 and ISO/IEC 18974 for Legal Pro...
OpenChain - The Ramifications of ISO/IEC 5230 and ISO/IEC 18974 for Legal Pro...
 
%in Rustenburg+277-882-255-28 abortion pills for sale in Rustenburg
%in Rustenburg+277-882-255-28 abortion pills for sale in Rustenburg%in Rustenburg+277-882-255-28 abortion pills for sale in Rustenburg
%in Rustenburg+277-882-255-28 abortion pills for sale in Rustenburg
 
Microsoft AI Transformation Partner Playbook.pdf
Microsoft AI Transformation Partner Playbook.pdfMicrosoft AI Transformation Partner Playbook.pdf
Microsoft AI Transformation Partner Playbook.pdf
 
Architecture decision records - How not to get lost in the past
Architecture decision records - How not to get lost in the pastArchitecture decision records - How not to get lost in the past
Architecture decision records - How not to get lost in the past
 
%in Soweto+277-882-255-28 abortion pills for sale in soweto
%in Soweto+277-882-255-28 abortion pills for sale in soweto%in Soweto+277-882-255-28 abortion pills for sale in soweto
%in Soweto+277-882-255-28 abortion pills for sale in soweto
 
Abortion Pill Prices Tembisa [(+27832195400*)] 🏥 Women's Abortion Clinic in T...
Abortion Pill Prices Tembisa [(+27832195400*)] 🏥 Women's Abortion Clinic in T...Abortion Pill Prices Tembisa [(+27832195400*)] 🏥 Women's Abortion Clinic in T...
Abortion Pill Prices Tembisa [(+27832195400*)] 🏥 Women's Abortion Clinic in T...
 
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
 
Abortion Pills In Pretoria ](+27832195400*)[ 🏥 Women's Abortion Clinic In Pre...
Abortion Pills In Pretoria ](+27832195400*)[ 🏥 Women's Abortion Clinic In Pre...Abortion Pills In Pretoria ](+27832195400*)[ 🏥 Women's Abortion Clinic In Pre...
Abortion Pills In Pretoria ](+27832195400*)[ 🏥 Women's Abortion Clinic In Pre...
 
WSO2CON 2024 - Building the API First Enterprise – Running an API Program, fr...
WSO2CON 2024 - Building the API First Enterprise – Running an API Program, fr...WSO2CON 2024 - Building the API First Enterprise – Running an API Program, fr...
WSO2CON 2024 - Building the API First Enterprise – Running an API Program, fr...
 
MarTech Trend 2024 Book : Marketing Technology Trends (2024 Edition) How Data...
MarTech Trend 2024 Book : Marketing Technology Trends (2024 Edition) How Data...MarTech Trend 2024 Book : Marketing Technology Trends (2024 Edition) How Data...
MarTech Trend 2024 Book : Marketing Technology Trends (2024 Edition) How Data...
 
The title is not connected to what is inside
The title is not connected to what is insideThe title is not connected to what is inside
The title is not connected to what is inside
 
%in Hazyview+277-882-255-28 abortion pills for sale in Hazyview
%in Hazyview+277-882-255-28 abortion pills for sale in Hazyview%in Hazyview+277-882-255-28 abortion pills for sale in Hazyview
%in Hazyview+277-882-255-28 abortion pills for sale in Hazyview
 
8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students
 
Shapes for Sharing between Graph Data Spaces - and Epistemic Querying of RDF-...
Shapes for Sharing between Graph Data Spaces - and Epistemic Querying of RDF-...Shapes for Sharing between Graph Data Spaces - and Epistemic Querying of RDF-...
Shapes for Sharing between Graph Data Spaces - and Epistemic Querying of RDF-...
 
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
 
WSO2CON2024 - It's time to go Platformless
WSO2CON2024 - It's time to go PlatformlessWSO2CON2024 - It's time to go Platformless
WSO2CON2024 - It's time to go Platformless
 
%in Midrand+277-882-255-28 abortion pills for sale in midrand
%in Midrand+277-882-255-28 abortion pills for sale in midrand%in Midrand+277-882-255-28 abortion pills for sale in midrand
%in Midrand+277-882-255-28 abortion pills for sale in midrand
 
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
 

Kernel Recipes 2019 - pidfds: Process file descriptors on Linux

  • 1. pidfds Process file descriptors on Linux Christian Brauner christian.brauner@ubuntu.com @brau_ner https://people.kernel.org/brauner https://brauner.io
  • 2. pidfd: what's that? file descriptor referring to a process specifically, an fd referring to a thread-group leader stable, private handle fd guarantees to reference the same process pidfds use pre-existing stable process handle reference struct pid, not task_struct struct pid { refcount_t count; unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; /* wait queue for pidfd notifications */ wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1]; };
  • 3. Why do this in the first place? pid recycling avoid pitfalls of pid recycling on high-pressure systems CVE-2019-6133: https://bugs.chromium.org/p/project-zero/issues/detail?id=1692 CVE-2014-5033: https://www.cvedetails.com/cve/CVE-2014-5033/ pid-based mac exploits: https://objective-see.com/blog/blog_0x41.html https://doc.qt.io/qt-5/qprocess.html#startDetached https://marc.info/?l=openssl-dev&m=130289811108150&w=2 CVE-2017-13209: (Android - Hardware Service Manager Arbitrary Service Replacement due to getpidcon) https://www.exploit-db.com/exploits/43513 Issue 851: Android: racy getpidcon usage permits binder service replacement https://bugs.chromium.org/p/project-zero/issues/detail?id=851
  • 4. Why do this in the first place? shared libraries allow to spawn invisible helper processes process management delegation hand of a handle to a non-parent process (e.g. for waiting, signaling) ubiquity of fds common patterns already exist everywhere in userspace
  • 5. Does userspace really care about this feature? dbus https://gitlab.freedesktop.org/dbus/dbus/issues/274 qt https://codereview.qt-project.org/c/qt/qtbase/+/108456 systemd https://github.com/systemd/systemd/issues/13101 criu https://github.com/checkpoint-restore/criu/issues/717 lmkd https://android-review.googlesource.com/c/platform/system/core/+/1088157 bpftrace https://github.com/iovisor/bpftrace/issues/880 mio https://github.com/samuelbrian/mio-pidfd
  • 6. Prior art Illumos pure userspace emulation of stable process handle procopen(), procrun(), procclose(), procfree(), etc. OpenBSD, NetBSD no private, stable process handles FreeBSD procdesc: pdfork(), pdgetpid(), pdkill() Linux forkfd(), CLONE_FD
  • 7. Building a new api 4 kernel releases individual elements to create a complete api
  • 8. 5.1 sending signals using pidfds to reliably send signals SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { int ret; struct fd f; struct pid *pid; kernel_siginfo_t kinfo; /* Enforce flags be set to 0 until we add an extension. */ if (flags) return -EINVAL; f = fdget(pidfd); if (!f.file) return -EBADF; /* Is this a pidfd? */ pid = pidfd_to_pid(f.file); if (IS_ERR(pid)) { ret = PTR_ERR(pid); goto err; }
  • 9. 5.2 CLONE_PIDFD create pidfds at process creation time O_CLOEXEC pidfds are close-on-exec by default /proc/<pid>/fd/fdinfo contains pid of process in procfs pidns /* * This has to happen after we've potentially unshared the file * descriptor table (so that the pidfd doesn't leak into the child * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, O_RDWR | O_CLOEXEC); if (IS_ERR(pidfile)) { put_unused_fd(pidfd); retval = PTR_ERR(pidfile); goto bad_fork_free_pid; } get_pid(pid); /* held by pidfile now */ retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; }
  • 10. 5.3 polling support exit notification for non-parents static void do_notify_pidfd(struct task_struct *task) { struct pid *pid; WARN_ON(task->exit_state == 0); pid = task_pid(task); wake_up_all(&pid->wait_pidfd); } /* * Poll support for process exit notification. */ static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct task_struct *task; struct pid *pid = file->private_data; int poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); rcu_read_lock(); task = pid_task(pid, PIDTYPE_PID); /* * Inform pollers only when the whole thread group exits. * If the thread group leader exits before all other threads in the * group, then poll(2) should block, similar to the wait(2) family. */ if (!task || (task->exit_state && thread_group_empty(task))) poll_flags = POLLIN | POLLRDNORM; rcu_read_unlock(); return poll_flags; }
  • 11. 5.3 pidfds without CLONE_PIDFD pidfd_open() to create pidfd SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { int fd, ret; struct pid *p; if (flags) return -EINVAL; if (pid <= 0) return -EINVAL; p = find_get_pid(pid); if (!p) return -ESRCH; ret = 0; rcu_read_lock(); if (!pid_task(p, PIDTYPE_TGID)) ret = -EINVAL; rcu_read_unlock(); fd = ret ?: pidfd_create(p); put_pid(p); return fd; }
  • 12. 5.4 (proposed) waiting through pidfds P_PIDFD for waitid() case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; ret = do_wait(&wo);
  • 13. Kill-on-close SIGKILL on last close kill process when last fd referencing it is closed
  • 14. exclusive waiting CLONE_WAIT_PID hide process from generic wait requests (e.g. waitid(P_ALL)
  • 15. Lessons learned speed matters choose a sustainable speed for developing features be open about being "dumb" it's ok to say "I don't know" or "I can't review that" be resilient reviews are a form of critique