[PATCH] android: binder: Disable preemption while holding the global binder lock

Fri Sep 9 15:44:23 UTC 2016

On Fri, Sep 09, 2016 at 08:17:44AM -0700, Todd Kjos wrote:
> From: Todd Kjos <tkjos at android.com>
> 
> In Android systems, the display pipeline relies on low
> latency binder transactions and is therefore sensitive to
> delays caused by contention for the global binder lock.
> Jank is significantly reduced by disabling preemption
> while the global binder lock is held.

What is the technical definition of "Jank"?  :)

> 
> This patch was originated by Riley Andrews <riandrews at android.com>
> with tweaks and forward-porting by me.
> 
> Originally-from: Riley Andrews <riandrews at android.com>
> Signed-off-by: Todd Kjos <tkjos at android.com>
> ---
>  drivers/android/binder.c | 194 +++++++++++++++++++++++++++++++++++------------
>  1 file changed, 146 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/android/binder.c b/drivers/android/binder.c
> index 16288e7..c36e420 100644
> --- a/drivers/android/binder.c
> +++ b/drivers/android/binder.c
> @@ -379,6 +379,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
>  	struct files_struct *files = proc->files;
>  	unsigned long rlim_cur;
>  	unsigned long irqs;
> +	int ret;
>  
>  	if (files == NULL)
>  		return -ESRCH;
> @@ -389,7 +390,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
>  	rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
>  	unlock_task_sighand(proc->tsk, &irqs);
>  
> -	return __alloc_fd(files, 0, rlim_cur, flags);
> +	preempt_enable_no_resched();
> +	ret = __alloc_fd(files, 0, rlim_cur, flags);
> +	preempt_disable();
> +
> +	return ret;
>  }
>  
>  /*
> @@ -398,8 +403,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
>  static void task_fd_install(
>  	struct binder_proc *proc, unsigned int fd, struct file *file)
>  {
> -	if (proc->files)
> +	if (proc->files) {
> +		preempt_enable_no_resched();
>  		__fd_install(proc->files, fd, file);
> +		preempt_disable();
> +	}
>  }
>  
>  /*
> @@ -427,6 +435,7 @@ static inline void binder_lock(const char *tag)
>  {
>  	trace_binder_lock(tag);
>  	mutex_lock(&binder_main_lock);
> +	preempt_disable();
>  	trace_binder_locked(tag);
>  }
>  
> @@ -434,8 +443,65 @@ static inline void binder_unlock(const char *tag)
>  {
>  	trace_binder_unlock(tag);
>  	mutex_unlock(&binder_main_lock);
> +	preempt_enable();
> +}
> +
> +static inline void *kzalloc_nopreempt(size_t size)
> +{
> +	void *ptr;
> +
> +	ptr = kzalloc(size, GFP_NOWAIT);
> +	if (ptr)
> +		return ptr;
> +
> +	preempt_enable_no_resched();
> +	ptr = kzalloc(size, GFP_KERNEL);
> +	preempt_disable();

Doesn't the allocator retry anyway if the first attempt fails?  Why not
GFP_NOIO or GFP_ATOMIC?  Have you really hit the second GFP_KERNEL
usage?

> +
> +	return ptr;
> +}
> +
> +static inline long copy_to_user_nopreempt(void __user *to,
> +					  const void *from, long n)
> +{
> +	long ret;
> +
> +	preempt_enable_no_resched();
> +	ret = copy_to_user(to, from, n);
> +	preempt_disable();
> +	return ret;
> +}
> +
> +static inline long copy_from_user_nopreempt(void *to,
> +					    const void __user *from,
> +					    long n)
> +{
> +	long ret;
> +
> +	preempt_enable_no_resched();
> +	ret = copy_from_user(to, from, n);
> +	preempt_disable();
> +	return ret;
>  }
>  
> +#define get_user_nopreempt(x, ptr)	\
> +({						\
> +	int __ret;				\
> +	preempt_enable_no_resched();		\
> +	__ret = get_user(x, ptr);		\
> +	preempt_disable();			\
> +	__ret;					\
> +})
> +
> +#define put_user_nopreempt(x, ptr)	\
> +({						\
> +	int __ret;				\
> +	preempt_enable_no_resched();		\
> +	__ret = put_user(x, ptr);		\
> +	preempt_disable();			\
> +	__ret;					\
> +})

Any reason some of these are #defines and some are static inline
functions?

Anyway, these all seem a bit strange to me.  What type of latency spikes
are you seeing that these changes resolve?  Shouldn't that be an issue
with the scheduler more than just the binder driver?

I don't know of any other driver or IPC that does this type of thing
with the scheduler in order to make things "go faster", so it feels
wrong to me, and is probably why we don't have global functions like
put_user_nopreempt() :)

And is enabling and disabling preemption around single-byte copies
to/from userspace really a good idea?  That seems like a lot of overhead
you are now adding to your "fastpath" that you need to go even faster.

And finally, I'm guessing this has passed the binder test suite that is
out there for testing binder changes?  If so, can you please mention it
in the changelog text?

thanks,

greg k-h