brcm2708: add linux 4.4 support

[openwrt.git] / target / linux / brcm2708 / patches-4.4 / 0080-Improve-__copy_to_user-and-__copy_from_user-performa.patch
diff --git a/target/linux/brcm2708/patches-4.4/0080-Improve-__copy_to_user-and-__copy_from_user-performa.patch b/target/linux/brcm2708/patches-4.4/0080-Improve-__copy_to_user-and-__copy_from_user-performa.patch

new file mode 100644 (file)

index 0000000..10f1184
--- /dev/null
+++ b/target/linux/brcm2708/patches-4.4/0080-Improve-__copy_to_user-and-__copy_from_user-performa.patch
@@ -0,0 +1,1510 @@
+From 61d24c12473972a4eb6b259f297d65a03fc09bda Mon Sep 17 00:00:00 2001
+From: Phil Elwell <phil@raspberrypi.org>
+Date: Mon, 13 Oct 2014 11:47:53 +0100
+Subject: [PATCH 080/127] Improve __copy_to_user and __copy_from_user
+ performance
+
+Provide a __copy_from_user that uses memcpy. On BCM2708, use
+optimised memcpy/memmove/memcmp/memset implementations.
+
+arch/arm: Add mmiocpy/set aliases for memcpy/set
+
+See: https://github.com/raspberrypi/linux/issues/1082
+---
+ arch/arm/include/asm/string.h      |   5 +
+ arch/arm/include/asm/uaccess.h     |   3 +
+ arch/arm/lib/Makefile              |  15 +-
+ arch/arm/lib/arm-mem.h             | 159 ++++++++++++
+ arch/arm/lib/copy_from_user.S      |   4 +-
+ arch/arm/lib/exports_rpi.c         |  37 +++
+ arch/arm/lib/memcmp_rpi.S          | 285 +++++++++++++++++++++
+ arch/arm/lib/memcpy_rpi.S          |  61 +++++
+ arch/arm/lib/memcpymove.h          | 506 +++++++++++++++++++++++++++++++++++++
+ arch/arm/lib/memmove_rpi.S         |  61 +++++
+ arch/arm/lib/memset_rpi.S          | 123 +++++++++
+ arch/arm/lib/uaccess_with_memcpy.c | 112 +++++++-
+ 12 files changed, 1365 insertions(+), 6 deletions(-)
+ create mode 100644 arch/arm/lib/arm-mem.h
+ create mode 100644 arch/arm/lib/exports_rpi.c
+ create mode 100644 arch/arm/lib/memcmp_rpi.S
+ create mode 100644 arch/arm/lib/memcpy_rpi.S
+ create mode 100644 arch/arm/lib/memcpymove.h
+ create mode 100644 arch/arm/lib/memmove_rpi.S
+ create mode 100644 arch/arm/lib/memset_rpi.S
+
+--- a/arch/arm/include/asm/string.h
++++ b/arch/arm/include/asm/string.h
+@@ -24,6 +24,11 @@ extern void * memchr(const void *, int,
+ #define __HAVE_ARCH_MEMSET
+ extern void * memset(void *, int, __kernel_size_t);
+ 
++#ifdef CONFIG_MACH_BCM2708 /* this configuration links memcmp_rpi.o (see arch/arm/lib/Makefile) */
++#define __HAVE_ARCH_MEMCMP /* presumably makes the generic string code omit its own memcmp - confirm */
++extern int memcmp(const void *, const void *, size_t);
++#endif /* CONFIG_MACH_BCM2708 */
++
+ extern void __memzero(void *ptr, __kernel_size_t n);
+ 
+ #define memset(p,v,n)                                                 \
+--- a/arch/arm/include/asm/uaccess.h
++++ b/arch/arm/include/asm/uaccess.h
+@@ -493,6 +493,9 @@ do {                                                                       \
+ extern unsigned long __must_check
+ arm_copy_from_user(void *to, const void __user *from, unsigned long n);
+ 
++extern unsigned long __must_check /* entry point is the ENTRY label in arch/arm/lib/copy_from_user.S */
++__copy_from_user_std(void *to, const void __user *from, unsigned long n);
++
+ static inline unsigned long __must_check
+ __copy_from_user(void *to, const void __user *from, unsigned long n)
+ {
+--- a/arch/arm/lib/Makefile
++++ b/arch/arm/lib/Makefile
+@@ -6,9 +6,8 @@
+ 
+ lib-y         := backtrace.o changebit.o csumipv6.o csumpartial.o   \
+                  csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
+-                 delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
+-                 memmove.o memset.o memzero.o setbit.o              \
+-                 strchr.o strrchr.o                                 \
++                 delay.o delay-loop.o findbit.o memchr.o memzero.o  \
++                 setbit.o strchr.o strrchr.o                        \
+                  testchangebit.o testclearbit.o testsetbit.o        \
+                  ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \
+                  ucmpdi2.o lib1funcs.o div64.o                      \
+@@ -18,6 +17,16 @@ lib-y               := backtrace.o changebit.o csumip
+ mmu-y         := clear_user.o copy_page.o getuser.o putuser.o       \
+                  copy_from_user.o copy_to_user.o
+ 
++# On BCM2708 (Raspberry Pi) use the *_rpi memory routines and define the uaccess_with_memcpy thresholds
++ifeq ($(CONFIG_MACH_BCM2708),y)
++  CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
++  CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
++  obj-$(CONFIG_MODULES) += exports_rpi.o
++  lib-y        += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
++else
++  lib-y        += memcpy.o memmove.o memset.o
++endif
++
+ # using lib_ here won't override already available weak symbols
+ obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
+ 
+--- /dev/null
++++ b/arch/arm/lib/arm-mem.h
+@@ -0,0 +1,159 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++.macro myfunc fname /* open a function: emit .func and a global label for fname */
++ .func fname
++ .global fname
++fname:
++.endm
++
++.macro preload_leading_step1  backwards, ptr, base
++/* If the destination is already 16-byte aligned, then we need to preload
++ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
++ * are no gaps when the inner loop starts.
++ */
++ .if backwards
++        sub     ptr, base, #1 /* step back one byte before rounding down */
++        bic     ptr, ptr, #31 /* ptr = start of that 32-byte line */
++ .else
++        bic     ptr, base, #31 /* ptr = base rounded down to a 32-byte line */
++ .endif
++ .set OFFSET, 0
++ .rept prefetch_distance+1 /* emits prefetch_distance+1 pld instructions, 32 bytes apart */
++        pld     [ptr, #OFFSET]
++  .if backwards
++   .set OFFSET, OFFSET-32 /* next pld targets the line below */
++  .else
++   .set OFFSET, OFFSET+32 /* next pld targets the line above */
++  .endif
++ .endr
++.endm
++
++.macro preload_leading_step2  backwards, ptr, base, leading_bytes, tmp
++/* However, if the destination is not 16-byte aligned, we may need to
++ * preload one more cache line than that. The question we need to ask is:
++ * are the leading bytes more than the amount by which the source
++ * pointer will be rounded down for preloading, and if so, by how many
++ * cache lines?
++ */
++ .if backwards
++/* Here we compare against how many bytes we are into the
++ * cache line, counting down from the highest such address.
++ * Effectively, we want to calculate
++ *     leading_bytes = dst&15
++ *     cacheline_offset = 31-((src-leading_bytes-1)&31)
++ *     extra_needed = leading_bytes - cacheline_offset
++ * and test if extra_needed is <= 0, or rearranging:
++ *     leading_bytes + (src-leading_bytes-1)&31 <= 31
++ */
++        mov     tmp, base, lsl #32-5 /* tmp = (base & 31) in the top five bits */
++        sbc     tmp, tmp, leading_bytes, lsl #32-5 /* NOTE(review): consumes the incoming carry flag; presumably expected clear to supply the -1 above - confirm */
++        adds    tmp, tmp, leading_bytes, lsl #32-5 /* carry out means the 5-bit sum exceeded 31 */
++        bcc     61f /* no carry: no extra line needed */
++        pld     [ptr, #-32*(prefetch_distance+1)]
++ .else
++/* Effectively, we want to calculate
++ *     leading_bytes = (-dst)&15
++ *     cacheline_offset = (src+leading_bytes)&31
++ *     extra_needed = leading_bytes - cacheline_offset
++ * and test if extra_needed is <= 0.
++ */
++        mov     tmp, base, lsl #32-5 /* tmp = (base & 31) in the top five bits */
++        add     tmp, tmp, leading_bytes, lsl #32-5 /* tmp = cacheline_offset, still in the top five bits */
++        rsbs    tmp, tmp, leading_bytes, lsl #32-5 /* flags from leading_bytes - cacheline_offset (both scaled) */
++        bls     61f /* lower or same: no extra line needed */
++        pld     [ptr, #32*(prefetch_distance+1)]
++ .endif
++61:
++.endm
++
++.macro preload_trailing  backwards, base, remain, tmp
++        /* We need either 0, 1 or 2 extra preloads */
++ .if backwards
++        rsb     tmp, base, #0
++        mov     tmp, tmp, lsl #32-5 /* tmp = ((-base) & 31) in the top five bits */
++ .else
++        mov     tmp, base, lsl #32-5 /* tmp = (base & 31) in the top five bits */
++ .endif
++        adds    tmp, tmp, remain, lsl #32-5 /* add (remain & 31), also in the top five bits */
++        adceqs  tmp, tmp, #0
++        /* The instruction above has two effects: ensures Z is only
++         * set if C was clear (so Z indicates that both shifted quantities
++         * were 0), and clears C if Z was set (so C indicates that the sum
++         * of the shifted quantities was greater and not equal to 32) */
++        beq     82f /* Z set: zero extra preloads */
++ .if backwards
++        sub     tmp, base, #1
++        bic     tmp, tmp, #31
++ .else
++        bic     tmp, base, #31 /* no S suffix, so C from above survives for the bcc */
++ .endif
++        bcc     81f /* C clear: one extra preload; C set: fall through and issue two */
++ .if backwards
++        pld     [tmp, #-32*(prefetch_distance+1)]
++81:
++        pld     [tmp, #-32*prefetch_distance]
++ .else
++        pld     [tmp, #32*(prefetch_distance+2)]
++81:
++        pld     [tmp, #32*(prefetch_distance+1)]
++ .endif
++82:
++.endm
++
++.macro preload_all    backwards, narrow_case, shift, base, remain, tmp0, tmp1 /* pld each 32-byte line spanned by (remain << shift) bytes at base */
++ .if backwards
++        sub     tmp0, base, #1
++        bic     tmp0, tmp0, #31 /* tmp0 = line containing base-1 */
++        pld     [tmp0]
++        sub     tmp1, base, remain, lsl #shift /* tmp1 = lowest address of the span */
++ .else
++        bic     tmp0, base, #31 /* tmp0 = line containing base */
++        pld     [tmp0]
++        add     tmp1, base, remain, lsl #shift
++        sub     tmp1, tmp1, #1 /* tmp1 = address of the last byte of the span */
++ .endif
++        bic     tmp1, tmp1, #31 /* tmp1 = line containing the far end of the span */
++        cmp     tmp1, tmp0
++        beq     92f /* single line, already preloaded above */
++ .if narrow_case
++        /* In this case, all the data fits in either 1 or 2 cache lines */
++        pld     [tmp1]
++ .else
++91:
++  .if backwards
++        sub     tmp0, tmp0, #32
++  .else
++        add     tmp0, tmp0, #32
++  .endif
++        cmp     tmp0, tmp1
++        pld     [tmp0]
++        bne     91b /* tests the cmp two lines up; stop once the far-end line has been preloaded */
++ .endif
++92:
++.endm
+--- a/arch/arm/lib/copy_from_user.S
++++ b/arch/arm/lib/copy_from_user.S
+@@ -89,11 +89,13 @@
+ 
+       .text
+ 
+-ENTRY(arm_copy_from_user)
++ENTRY(__copy_from_user_std) /* new name for the copy_template.S based routine */
++WEAK(arm_copy_from_user) /* same address, now weak; presumably so uaccess_with_memcpy.c can override it - confirm */
+ 
+ #include "copy_template.S"
+ 
+ ENDPROC(arm_copy_from_user)
++ENDPROC(__copy_from_user_std)
+ 
+       .pushsection .fixup,"ax"
+       .align 0
+--- /dev/null
++++ b/arch/arm/lib/exports_rpi.c
+@@ -0,0 +1,37 @@
++/**
++ * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions, and the following disclaimer,
++ *    without modification.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ * 3. The names of the above-listed copyright holders may not be used
++ *    to endorse or promote products derived from this software without
++ *    specific prior written permission.
++ *
++ * ALTERNATIVELY, this software may be distributed under the terms of the
++ * GNU General Public License ("GPL") version 2, as published by the Free
++ * Software Foundation.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
++ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
++ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++EXPORT_SYMBOL(memcmp); /* make memcmp available to loadable modules; the Makefile builds this file only when CONFIG_MODULES is set */
+--- /dev/null
++++ b/arch/arm/lib/memcmp_rpi.S
+@@ -0,0 +1,285 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++    .text
++    .arch armv6
++    .object_arch armv4
++    .arm
++    .altmacro
++    .p2align 2
++
++.macro memcmp_process_head  unaligned /* load the next 16 bytes of each buffer into DAT0-3 and DAT4-7 */
++ .if unaligned /* callers pass 1 when S_1 is not 4-byte aligned: use single-word loads */
++        ldr     DAT0, [S_1], #4
++        ldr     DAT1, [S_1], #4
++        ldr     DAT2, [S_1], #4
++        ldr     DAT3, [S_1], #4
++ .else
++        ldmia   S_1!, {DAT0, DAT1, DAT2, DAT3}
++ .endif
++        ldmia   S_2!, {DAT4, DAT5, DAT6, DAT7} /* memcmp aligns S_2 before any use of this macro */
++.endm
++
++.macro memcmp_process_tail /* compare the 16 bytes loaded by memcmp_process_head */
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5 /* each cmpeq runs only while all earlier words matched */
++        cmpeq   DAT2, DAT6
++        cmpeq   DAT3, DAT7
++        bne     200f /* flags still describe the first differing word pair */
++.endm
++
++.macro memcmp_leading_31bytes /* compare (OFF & 31) leading bytes, selected bit by bit from OFF */
++        movs    DAT0, OFF, lsl #31 /* C = OFF bit 1, N = OFF bit 0 */
++        ldrmib  DAT0, [S_1], #1
++        ldrcsh  DAT1, [S_1], #2
++        ldrmib  DAT4, [S_2], #1
++        ldrcsh  DAT5, [S_2], #2
++        movpl   DAT0, #0 /* zero whatever was not loaded so it compares equal */
++        movcc   DAT1, #0
++        movpl   DAT4, #0
++        movcc   DAT5, #0
++        submi   N, N, #1
++        subcs   N, N, #2
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        bne     200f
++        movs    DAT0, OFF, lsl #29 /* C = OFF bit 3, N = OFF bit 2 */
++        ldrmi   DAT0, [S_1], #4
++        ldrcs   DAT1, [S_1], #4
++        ldrcs   DAT2, [S_1], #4
++        ldrmi   DAT4, [S_2], #4
++        ldmcsia S_2!, {DAT5, DAT6}
++        movpl   DAT0, #0 /* as above: unloaded registers are forced equal */
++        movcc   DAT1, #0
++        movcc   DAT2, #0
++        movpl   DAT4, #0
++        movcc   DAT5, #0
++        movcc   DAT6, #0
++        submi   N, N, #4
++        subcs   N, N, #8
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        cmpeq   DAT2, DAT6
++        bne     200f
++        tst     OFF, #16 /* OFF bit 4: one more 16-byte chunk to compare */
++        beq     105f
++        memcmp_process_head  1
++        sub     N, N, #16
++        memcmp_process_tail
++105:
++.endm
++
++.macro memcmp_trailing_15bytes  unaligned /* compare the final (N & 15) bytes; N is destroyed */
++        movs    N, N, lsl #29 /* C = N bit 3 (8 bytes), N flag = N bit 2 (4 bytes) */
++ .if unaligned
++        ldrcs   DAT0, [S_1], #4
++        ldrcs   DAT1, [S_1], #4
++ .else
++        ldmcsia S_1!, {DAT0, DAT1}
++ .endif
++        ldrmi   DAT2, [S_1], #4
++        ldmcsia S_2!, {DAT4, DAT5}
++        ldrmi   DAT6, [S_2], #4
++        movcc   DAT0, #0 /* zero whatever was not loaded so it compares equal */
++        movcc   DAT1, #0
++        movpl   DAT2, #0
++        movcc   DAT4, #0
++        movcc   DAT5, #0
++        movpl   DAT6, #0
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        cmpeq   DAT2, DAT6
++        bne     200f
++        movs    N, N, lsl #2 /* C = original N bit 1 (2 bytes), N flag = original N bit 0 (1 byte) */
++        ldrcsh  DAT0, [S_1], #2
++        ldrmib  DAT1, [S_1] /* last access: no writeback needed */
++        ldrcsh  DAT4, [S_2], #2
++        ldrmib  DAT5, [S_2]
++        movcc   DAT0, #0
++        movpl   DAT1, #0
++        movcc   DAT4, #0
++        movpl   DAT5, #0
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        bne     200f
++.endm
++
++.macro memcmp_long_inner_loop  unaligned /* main loop, tail loop and return for the long case */
++110:
++        memcmp_process_head  unaligned
++        pld     [S_2, #prefetch_distance*32 + 16] /* first 16-byte half of the iteration prefetches for S_2 */
++        memcmp_process_tail
++        memcmp_process_head  unaligned
++        pld     [S_1, OFF] /* second half prefetches for S_1; OFF is set up by memcmp */
++        memcmp_process_tail
++        subs    N, N, #32 /* 32 bytes compared per iteration */
++        bhs     110b
++        /* Just before the final (prefetch_distance+1) 32-byte blocks,
++         * deal with final preloads */
++        preload_trailing  0, S_1, N, DAT0
++        preload_trailing  0, S_2, N, DAT0
++        add     N, N, #(prefetch_distance+2)*32 - 16 /* remove the entry bias, leaving N = bytes left minus 16 */
++120:
++        memcmp_process_head  unaligned
++        memcmp_process_tail
++        subs    N, N, #16
++        bhs     120b
++        /* Trailing words and bytes */
++        tst     N, #15
++        beq     199f
++        memcmp_trailing_15bytes  unaligned
++199:    /* Reached end without detecting a difference */
++        mov     a1, #0
++        setend  le /* undo the setend be done at function entry */
++        pop     {DAT1-DAT6, pc} /* restore registers pushed at entry and return */
++.endm
++
++.macro memcmp_short_inner_loop  unaligned /* 16-byte loop, tail and return for the short case */
++        subs    N, N, #16     /* simplifies inner loop termination */
++        blo     122f /* fewer than 16 bytes: straight to the tail */
++120:
++        memcmp_process_head  unaligned
++        memcmp_process_tail
++        subs    N, N, #16
++        bhs     120b
++122:    /* Trailing words and bytes */
++        tst     N, #15 /* low four bits of N still hold the leftover byte count */
++        beq     199f
++        memcmp_trailing_15bytes  unaligned
++199:    /* Reached end without detecting a difference */
++        mov     a1, #0
++        setend  le /* undo the setend be done at function entry */
++        pop     {DAT1-DAT6, pc} /* restore registers pushed at entry and return */
++.endm
++
++/*
++ * int memcmp(const void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to buffer 1
++ * a2 = pointer to buffer 2
++ * a3 = number of bytes to compare (as unsigned chars)
++ * On exit:
++ * a1 = >0/=0/<0 if s1 >/=/< s2
++ */
++
++.set prefetch_distance, 2
++
++ENTRY(memcmp)
++        S_1     .req    a1
++        S_2     .req    a2
++        N       .req    a3
++        DAT0    .req    a4
++        DAT1    .req    v1
++        DAT2    .req    v2
++        DAT3    .req    v3
++        DAT4    .req    v4
++        DAT5    .req    v5
++        DAT6    .req    v6
++        DAT7    .req    ip
++        OFF     .req    lr /* lr is reused as scratch, which is why it is pushed below */
++
++        push    {DAT1-DAT6, lr}
++        setend  be /* lowest-addressed bytes are most significant */
++
++        /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++        cmp     N, #(prefetch_distance+3)*32 - 1 /* NOTE(review): one block more than the comment says, presumably to cover up to 31 leading bytes - confirm */
++        blo     170f
++
++        /* Long case */
++        /* Adjust N so that the decrement instruction can also test for
++         * inner loop termination. We want it to stop when there are
++         * (prefetch_distance+1) complete blocks to go. */
++        sub     N, N, #(prefetch_distance+2)*32
++        preload_leading_step1  0, DAT0, S_1
++        preload_leading_step1  0, DAT1, S_2
++        tst     S_2, #31
++        beq     154f
++        rsb     OFF, S_2, #0 /* no need to AND with 31 here: only bits 0-4 of OFF are examined */
++        preload_leading_step2  0, DAT0, S_1, OFF, DAT2
++        preload_leading_step2  0, DAT1, S_2, OFF, DAT2
++        memcmp_leading_31bytes
++154:    /* Second source now cacheline (32-byte) aligned; we have at
++         * least one prefetch to go. */
++        /* Prefetch offset is best selected such that it lies in the
++         * first 8 of each 32 bytes - but it's just as easy to aim for
++         * the first one */
++        and     OFF, S_1, #31
++        rsb     OFF, OFF, #32*prefetch_distance /* OFF = 32*prefetch_distance - (S_1 & 31), used by pld [S_1, OFF] */
++        tst     S_1, #3
++        bne     140f
++        memcmp_long_inner_loop  0 /* expansion ends in a return, so there is no fall-through */
++140:    memcmp_long_inner_loop  1
++
++170:    /* Short case */
++        teq     N, #0
++        beq     199f /* zero length: nearest following 199 label returns 0 */
++        preload_all 0, 0, 0, S_1, N, DAT0, DAT1
++        preload_all 0, 0, 0, S_2, N, DAT0, DAT1
++        tst     S_2, #3
++        beq     174f
++172:    subs    N, N, #1 /* byte-at-a-time until S_2 is 4-byte aligned or N runs out */
++        blo     199f
++        ldrb    DAT0, [S_1], #1
++        ldrb    DAT4, [S_2], #1
++        cmp     DAT0, DAT4
++        bne     200f
++        tst     S_2, #3
++        bne     172b
++174:    /* Second source now 4-byte aligned; we have 0 or more bytes to go */
++        tst     S_1, #3
++        bne     140f
++        memcmp_short_inner_loop  0 /* expansion ends in a return, so there is no fall-through */
++140:    memcmp_short_inner_loop  1
++
++200:    /* Difference found: determine sign. */
++        movhi   a1, #1 /* unsigned compare of the first differing pair: S_1 data higher */
++        movlo   a1, #-1 /* S_1 data lower */
++        setend  le /* restore little-endian data accesses before returning */
++        pop     {DAT1-DAT6, pc}
++
++        .unreq  S_1
++        .unreq  S_2
++        .unreq  N
++        .unreq  DAT0
++        .unreq  DAT1
++        .unreq  DAT2
++        .unreq  DAT3
++        .unreq  DAT4
++        .unreq  DAT5
++        .unreq  DAT6
++        .unreq  DAT7
++        .unreq  OFF
++ENDPROC(memcmp)
+--- /dev/null
++++ b/arch/arm/lib/memcpy_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++#include "memcpymove.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++    .text
++    .arch armv6
++    .object_arch armv4
++    .arm
++    .altmacro
++    .p2align 2
++
++/*
++ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3
++
++ENTRY(mmiocpy) /* alias: both labels mark the same entry point */
++ENTRY(memcpy)
++        memcpy  0 /* presumably the memcpy macro from memcpymove.h with backwards = 0 - confirm */
++ENDPROC(memcpy)
++ENDPROC(mmiocpy)
+--- /dev/null
++++ b/arch/arm/lib/memcpymove.h
+@@ -0,0 +1,506 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++.macro unaligned_words  backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8 /* store `words` words to D, each merged from two adjacent source words shifted by align*8 bits */
++ .if words == 1 /* forwards r1 carries the previous source word in and the newest out; backwards r0 does */
++  .if backwards
++        mov     r1, r0, lsl #32-align*8
++        ldr     r0, [S, #-4]!
++        orr     r1, r1, r0, lsr #align*8
++        str     r1, [D, #-4]!
++  .else
++        mov     r0, r1, lsr #align*8
++        ldr     r1, [S, #4]!
++        orr     r0, r0, r1, lsl #32-align*8
++        str     r0, [D], #4
++  .endif
++ .elseif words == 2 /* forwards the carried word lives in r2; backwards in r0 */
++  .if backwards
++        ldr     r1, [S, #-4]!
++        mov     r2, r0, lsl #32-align*8
++        ldr     r0, [S, #-4]!
++        orr     r2, r2, r1, lsr #align*8
++        mov     r1, r1, lsl #32-align*8
++        orr     r1, r1, r0, lsr #align*8
++        stmdb   D!, {r1, r2}
++  .else
++        ldr     r1, [S, #4]!
++        mov     r0, r2, lsr #align*8
++        ldr     r2, [S, #4]!
++        orr     r0, r0, r1, lsl #32-align*8
++        mov     r1, r1, lsr #align*8
++        orr     r1, r1, r2, lsl #32-align*8
++        stmia   D!, {r0, r1}
++  .endif
++ .elseif words == 4 /* forwards the carried word lives in r4; backwards in r0 */
++  .if backwards
++        ldmdb   S!, {r2, r3}
++        mov     r4, r0, lsl #32-align*8
++        ldmdb   S!, {r0, r1}
++        orr     r4, r4, r3, lsr #align*8
++        mov     r3, r3, lsl #32-align*8
++        orr     r3, r3, r2, lsr #align*8
++        mov     r2, r2, lsl #32-align*8
++        orr     r2, r2, r1, lsr #align*8
++        mov     r1, r1, lsl #32-align*8
++        orr     r1, r1, r0, lsr #align*8
++        stmdb   D!, {r1, r2, r3, r4}
++  .else
++        ldmib   S!, {r1, r2}
++        mov     r0, r4, lsr #align*8
++        ldmib   S!, {r3, r4}
++        orr     r0, r0, r1, lsl #32-align*8
++        mov     r1, r1, lsr #align*8
++        orr     r1, r1, r2, lsl #32-align*8
++        mov     r2, r2, lsr #align*8
++        orr     r2, r2, r3, lsl #32-align*8
++        mov     r3, r3, lsr #align*8
++        orr     r3, r3, r4, lsl #32-align*8
++        stmia   D!, {r0, r1, r2, r3}
++  .endif
++ .elseif words == 8 /* forwards the carried word lives in r8; backwards in r0 */
++  .if backwards
++        ldmdb   S!, {r4, r5, r6, r7}
++        mov     r8, r0, lsl #32-align*8
++        ldmdb   S!, {r0, r1, r2, r3}
++   .if use_pld
++        pld     [S, OFF] /* only this 8-word variant issues a prefetch */
++   .endif
++        orr     r8, r8, r7, lsr #align*8
++        mov     r7, r7, lsl #32-align*8
++        orr     r7, r7, r6, lsr #align*8
++        mov     r6, r6, lsl #32-align*8
++        orr     r6, r6, r5, lsr #align*8
++        mov     r5, r5, lsl #32-align*8
++        orr     r5, r5, r4, lsr #align*8
++        mov     r4, r4, lsl #32-align*8
++        orr     r4, r4, r3, lsr #align*8
++        mov     r3, r3, lsl #32-align*8
++        orr     r3, r3, r2, lsr #align*8
++        mov     r2, r2, lsl #32-align*8
++        orr     r2, r2, r1, lsr #align*8
++        mov     r1, r1, lsl #32-align*8
++        orr     r1, r1, r0, lsr #align*8
++        stmdb   D!, {r5, r6, r7, r8}
++        stmdb   D!, {r1, r2, r3, r4}
++  .else
++        ldmib   S!, {r1, r2, r3, r4}
++        mov     r0, r8, lsr #align*8
++        ldmib   S!, {r5, r6, r7, r8}
++   .if use_pld
++        pld     [S, OFF] /* only this 8-word variant issues a prefetch */
++   .endif
++        orr     r0, r0, r1, lsl #32-align*8
++        mov     r1, r1, lsr #align*8
++        orr     r1, r1, r2, lsl #32-align*8
++        mov     r2, r2, lsr #align*8
++        orr     r2, r2, r3, lsl #32-align*8
++        mov     r3, r3, lsr #align*8
++        orr     r3, r3, r4, lsl #32-align*8
++        mov     r4, r4, lsr #align*8
++        orr     r4, r4, r5, lsl #32-align*8
++        mov     r5, r5, lsr #align*8
++        orr     r5, r5, r6, lsl #32-align*8
++        mov     r6, r6, lsr #align*8
++        orr     r6, r6, r7, lsl #32-align*8
++        mov     r7, r7, lsr #align*8
++        orr     r7, r7, r8, lsl #32-align*8
++        stmia   D!, {r0, r1, r2, r3}
++        stmia   D!, {r4, r5, r6, r7}
++  .endif
++ .endif
++.endm
++
++.macro memcpy_leading_15bytes  backwards, align /* copy the (DAT2 & 15) leading bytes, selected bit by bit from DAT2 */
++        movs    DAT1, DAT2, lsl #31 /* C = DAT2 bit 1 (2 bytes), N = DAT2 bit 0 (1 byte) */
++        sub     N, N, DAT2 /* no S suffix: the flags from movs stay valid below */
++ .if backwards
++        ldrmib  DAT0, [S, #-1]!
++        ldrcsh  DAT1, [S, #-2]!
++        strmib  DAT0, [D, #-1]!
++        strcsh  DAT1, [D, #-2]!
++ .else
++        ldrmib  DAT0, [S], #1
++        ldrcsh  DAT1, [S], #2
++        strmib  DAT0, [D], #1
++        strcsh  DAT1, [D], #2
++ .endif
++        movs    DAT1, DAT2, lsl #29 /* C = DAT2 bit 3 (8 bytes), N = DAT2 bit 2 (4 bytes) */
++ .if backwards
++        ldrmi   DAT0, [S, #-4]!
++  .if align == 0
++        ldmcsdb S!, {DAT1, DAT2}
++  .else
++        ldrcs   DAT2, [S, #-4]! /* single-word loads when align is non-zero */
++        ldrcs   DAT1, [S, #-4]!
++  .endif
++        strmi   DAT0, [D, #-4]!
++        stmcsdb D!, {DAT1, DAT2}
++ .else
++        ldrmi   DAT0, [S], #4
++  .if align == 0
++        ldmcsia S!, {DAT1, DAT2}
++  .else
++        ldrcs   DAT1, [S], #4 /* single-word loads when align is non-zero */
++        ldrcs   DAT2, [S], #4
++  .endif
++        strmi   DAT0, [D], #4
++        stmcsia D!, {DAT1, DAT2}
++ .endif
++.endm
++
++.macro memcpy_trailing_15bytes  backwards, align /* copy the final (N & 15) bytes; N is destroyed */
++        movs    N, N, lsl #29 /* C = N bit 3 (8 bytes), N flag = N bit 2 (4 bytes) */
++ .if backwards
++  .if align == 0
++        ldmcsdb S!, {DAT0, DAT1}
++  .else
++        ldrcs   DAT1, [S, #-4]! /* single-word loads when align is non-zero */
++        ldrcs   DAT0, [S, #-4]!
++  .endif
++        ldrmi   DAT2, [S, #-4]!
++        stmcsdb D!, {DAT0, DAT1}
++        strmi   DAT2, [D, #-4]!
++ .else
++  .if align == 0
++        ldmcsia S!, {DAT0, DAT1}
++  .else
++        ldrcs   DAT0, [S], #4 /* single-word loads when align is non-zero */
++        ldrcs   DAT1, [S], #4
++  .endif
++        ldrmi   DAT2, [S], #4
++        stmcsia D!, {DAT0, DAT1}
++        strmi   DAT2, [D], #4
++ .endif
++        movs    N, N, lsl #2 /* C = original N bit 1 (2 bytes), N flag = original N bit 0 (1 byte) */
++ .if backwards
++        ldrcsh  DAT0, [S, #-2]!
++        ldrmib  DAT1, [S, #-1] /* last access: no writeback needed */
++        strcsh  DAT0, [D, #-2]!
++        strmib  DAT1, [D, #-1]
++ .else
++        ldrcsh  DAT0, [S], #2
++        ldrmib  DAT1, [S] /* last access: no writeback needed */
++        strcsh  DAT0, [D], #2
++        strmib  DAT1, [D]
++ .endif
++.endm
++
++.macro memcpy_long_inner_loop  backwards, align
++ .if align != 0                 /* move S back by align bytes and load the word there as the carried-in data */
++  .if backwards
++        ldr     DAT0, [S, #-align]!
++  .else
++        ldr     LAST, [S, #-align]!
++  .endif
++ .endif
++110:    /* main loop: N drops by 32 per pass; the aligned case issues pld [S, OFF] */
++ .if align == 0
++  .if backwards
++        ldmdb   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++        pld     [S, OFF]
++        stmdb   D!, {DAT4, DAT5, DAT6, LAST}
++        stmdb   D!, {DAT0, DAT1, DAT2, DAT3}
++  .else
++        ldmia   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++        pld     [S, OFF]
++        stmia   D!, {DAT0, DAT1, DAT2, DAT3}
++        stmia   D!, {DAT4, DAT5, DAT6, LAST}
++  .endif
++ .else
++        unaligned_words  backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
++ .endif
++        subs    N, N, #32
++        bhs     110b
++        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
++        preload_trailing  backwards, S, N, OFF
++        add     N, N, #(prefetch_distance+2)*32 - 32
++120:    /* remaining 32-byte blocks: same copy, but no pld in the aligned case */
++ .if align == 0
++  .if backwards
++        ldmdb   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++        stmdb   D!, {DAT4, DAT5, DAT6, LAST}
++        stmdb   D!, {DAT0, DAT1, DAT2, DAT3}
++  .else
++        ldmia   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++        stmia   D!, {DAT0, DAT1, DAT2, DAT3}
++        stmia   D!, {DAT4, DAT5, DAT6, LAST}
++  .endif
++ .else
++        unaligned_words  backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
++ .endif
++        subs    N, N, #32
++        bhs     120b
++        tst     N, #16          /* one more 16-byte block if bit 4 of N is set */
++ .if align == 0
++  .if backwards
++        ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++        stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++        stmneia D!, {DAT0, DAT1, DAT2, LAST}
++  .endif
++ .else
++        beq     130f
++        unaligned_words  backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
++130:
++ .endif
++        /* Trailing words and bytes */
++        tst      N, #15
++        beq      199f
++ .if align != 0
++        add     S, S, #align    /* presumably undoes the S adjustment made at the top of this macro - TODO confirm */
++ .endif
++        memcpy_trailing_15bytes  backwards, align
++199:
++        pop     {DAT3, DAT4, DAT5, DAT6, DAT7}  /* registers pushed by the long case of the memcpy macro */
++        pop     {D, DAT1, DAT2, pc}             /* restore D (a1), DAT1, DAT2 and return */
++.endm
++
++.macro memcpy_medium_inner_loop  backwards, align
++120:    /* 16 bytes per pass; align == 0 loads with one ldm, otherwise with four ldr */
++ .if backwards
++  .if align == 0
++        ldmdb   S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldr     LAST, [S, #-4]!
++        ldr     DAT2, [S, #-4]!
++        ldr     DAT1, [S, #-4]!
++        ldr     DAT0, [S, #-4]!
++  .endif
++        stmdb   D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++  .if align == 0
++        ldmia   S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldr     DAT0, [S], #4
++        ldr     DAT1, [S], #4
++        ldr     DAT2, [S], #4
++        ldr     LAST, [S], #4
++  .endif
++        stmia   D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++        subs     N, N, #16      /* keep looping while the subtraction does not borrow */
++        bhs      120b
++        /* Trailing words and bytes */
++        tst      N, #15
++        beq      199f
++        memcpy_trailing_15bytes  backwards, align
++199:
++        pop     {D, DAT1, DAT2, pc}     /* restore D (a1), DAT1, DAT2 and return */
++.endm
++
++.macro memcpy_short_inner_loop  backwards, align
++        tst     N, #16          /* copy a single 16-byte block only if bit 4 of N is set */
++ .if backwards
++  .if align == 0
++        ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldrne   LAST, [S, #-4]!
++        ldrne   DAT2, [S, #-4]!
++        ldrne   DAT1, [S, #-4]!
++        ldrne   DAT0, [S, #-4]!
++  .endif
++        stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++  .if align == 0
++        ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldrne   DAT0, [S], #4
++        ldrne   DAT1, [S], #4
++        ldrne   DAT2, [S], #4
++        ldrne   LAST, [S], #4
++  .endif
++        stmneia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++        memcpy_trailing_15bytes  backwards, align
++199:    /* exit point; the short case of the memcpy macro also branches forward to a 199 label */
++        pop     {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy backwards
++        D       .req    a1      /* destination pointer */
++        S       .req    a2      /* source pointer */
++        N       .req    a3      /* byte count */
++        DAT0    .req    a4
++        DAT1    .req    v1
++        DAT2    .req    v2
++        DAT3    .req    v3
++        DAT4    .req    v4
++        DAT5    .req    v5
++        DAT6    .req    v6
++        DAT7    .req    sl
++        LAST    .req    ip
++        OFF     .req    lr      /* preload offset, used as pld [S, OFF] in the long loop */
++
++        .cfi_startproc
++
++        push    {D, DAT1, DAT2, lr}     /* D is saved so that a1 can be restored on exit */
++
++        .cfi_def_cfa_offset 16
++        .cfi_rel_offset D, 0
++        .cfi_undefined  S
++        .cfi_undefined  N
++        .cfi_undefined  DAT0
++        .cfi_rel_offset DAT1, 4
++        .cfi_rel_offset DAT2, 8
++        .cfi_undefined  LAST
++        .cfi_rel_offset lr, 12
++
++ .if backwards                  /* a backwards copy starts from the end of both buffers */
++        add     D, D, N
++        add     S, S, N
++ .endif
++
++        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++        cmp     N, #31
++        blo     170f
++        /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++        cmp     N, #(prefetch_distance+3)*32 - 1
++        blo     160f
++
++        /* Long case */
++        push    {DAT3, DAT4, DAT5, DAT6, DAT7}
++
++        .cfi_def_cfa_offset 36
++        .cfi_rel_offset D, 20
++        .cfi_rel_offset DAT1, 24
++        .cfi_rel_offset DAT2, 28
++        .cfi_rel_offset DAT3, 0
++        .cfi_rel_offset DAT4, 4
++        .cfi_rel_offset DAT5, 8
++        .cfi_rel_offset DAT6, 12
++        .cfi_rel_offset DAT7, 16
++        .cfi_rel_offset lr, 32
++
++        /* Adjust N so that the decrement instruction can also test for
++         * inner loop termination. We want it to stop when there are
++         * (prefetch_distance+1) complete blocks to go. */
++        sub     N, N, #(prefetch_distance+2)*32
++        preload_leading_step1  backwards, DAT0, S
++ .if backwards
++        /* Bug in GAS: it accepts, but mis-assembles the instruction
++         * ands    DAT2, D, #60, 2
++         * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
++         */
++        .word   0xE210513C
++        beq     154f
++ .else
++        ands    DAT2, D, #15
++        beq     154f
++        rsb     DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
++ .endif
++        preload_leading_step2  backwards, DAT0, S, DAT2, OFF
++        memcpy_leading_15bytes backwards, 1
++154:    /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
++        /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
++ .if backwards
++        rsb     OFF, S, #3
++        and     OFF, OFF, #28
++        sub     OFF, OFF, #32*(prefetch_distance+1)
++ .else
++        and     OFF, S, #28
++        rsb     OFF, OFF, #32*prefetch_distance
++ .endif
++        movs    DAT0, S, lsl #31        /* CS = bit 1 of S, MI = bit 0 of S: pick the loop for source alignment 0..3 */
++        bhi     157f
++        bcs     156f
++        bmi     155f
++        memcpy_long_inner_loop  backwards, 0
++155:    memcpy_long_inner_loop  backwards, 1
++156:    memcpy_long_inner_loop  backwards, 2
++157:    memcpy_long_inner_loop  backwards, 3
++
++        .cfi_def_cfa_offset 16
++        .cfi_rel_offset D, 0
++        .cfi_rel_offset DAT1, 4
++        .cfi_rel_offset DAT2, 8
++        .cfi_same_value DAT3
++        .cfi_same_value DAT4
++        .cfi_same_value DAT5
++        .cfi_same_value DAT6
++        .cfi_same_value DAT7
++        .cfi_rel_offset lr, 12
++
++160:    /* Medium case */
++        preload_all  backwards, 0, 0, S, N, DAT2, OFF
++        sub     N, N, #16     /* simplifies inner loop termination */
++ .if backwards
++        ands    DAT2, D, #15
++        beq     164f
++ .else
++        ands    DAT2, D, #15
++        beq     164f
++        rsb     DAT2, DAT2, #16
++ .endif
++        memcpy_leading_15bytes backwards, align /* NOTE(review): align is not a parameter of this macro - confirm what it evaluates to here */
++164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
++        tst     S, #3           /* a word-aligned source takes the align 0 variant, anything else the align 1 variant */
++        bne     140f
++        memcpy_medium_inner_loop  backwards, 0
++140:    memcpy_medium_inner_loop  backwards, 1
++
++170:    /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
++        teq     N, #0
++        beq     199f
++        preload_all  backwards, 1, 0, S, N, DAT2, LAST
++        tst     D, #3
++        beq     174f
++172:    subs    N, N, #1        /* copy single bytes until D is word aligned or N runs out */
++        blo     199f
++ .if backwards
++        ldrb    DAT0, [S, #-1]!
++        strb    DAT0, [D, #-1]!
++ .else
++        ldrb    DAT0, [S], #1
++        strb    DAT0, [D], #1
++ .endif
++        tst     D, #3
++        bne     172b
++174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
++        tst     S, #3
++        bne     140f
++        memcpy_short_inner_loop  backwards, 0
++140:    memcpy_short_inner_loop  backwards, 1
++
++        .cfi_endproc
++
++        .unreq  D
++        .unreq  S
++        .unreq  N
++        .unreq  DAT0
++        .unreq  DAT1
++        .unreq  DAT2
++        .unreq  DAT3
++        .unreq  DAT4
++        .unreq  DAT5
++        .unreq  DAT6
++        .unreq  DAT7
++        .unreq  LAST
++        .unreq  OFF
++.endm
+--- /dev/null
++++ b/arch/arm/lib/memmove_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++#include "memcpymove.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++    .text
++    .arch armv6
++    .object_arch armv4
++    .arm
++    .altmacro
++    .p2align 2
++
++/*
++ * void *memmove(void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3
++
++ENTRY(memmove)
++        cmp     a2, a1          /* flags from (source - destination) */
++        bpl     memcpy  /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
++        memcpy  1               /* otherwise expand the shared copy macro with backwards = 1 */
++ENDPROC(memmove)
+--- /dev/null
++++ b/arch/arm/lib/memset_rpi.S
+@@ -0,0 +1,123 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++    .text
++    .arch armv6
++    .object_arch armv4
++    .arm
++    .altmacro
++    .p2align 2
++
++/*
++ *  void *memset(void *s, int c, size_t n);
++ *  On entry:
++ *  a1 = pointer to buffer to fill
++ *  a2 = byte pattern to fill with (caller-narrowed)
++ *  a3 = number of bytes to fill
++ *  On exit:
++ *  a1 preserved
++ */
++ENTRY(mmioset)
++ENTRY(memset)
++        S       .req    a1
++        DAT0    .req    a2
++        N       .req    a3
++        DAT1    .req    a4
++        DAT2    .req    ip
++        DAT3    .req    lr
++
++        orr     DAT0, DAT0, lsl #8      /* replicate the fill byte into the low halfword ... */
++        push    {S, lr}                 /* S is saved so that a1 can be restored on exit */
++        orr     DAT0, DAT0, lsl #16     /* ... and then into all four bytes of DAT0 */
++        mov     DAT1, DAT0
++
++        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++        cmp     N, #31
++        blo     170f
++
++161:    sub     N, N, #16     /* simplifies inner loop termination */
++        /* Leading words and bytes */
++        tst     S, #15
++        beq     164f
++        rsb     DAT3, S, #0   /* bits 0-3 = number of leading bytes until aligned */
++        movs    DAT2, DAT3, lsl #31     /* MI = bit 0 (1 byte), CS = bit 1 (2 bytes) */
++        submi   N, N, #1
++        strmib  DAT0, [S], #1
++        subcs   N, N, #2
++        strcsh  DAT0, [S], #2
++        movs    DAT2, DAT3, lsl #29     /* MI = bit 2 (4 bytes), CS = bit 3 (8 bytes) */
++        submi   N, N, #4
++        strmi   DAT0, [S], #4
++        subcs   N, N, #8
++        stmcsia S!, {DAT0, DAT1}
++164:    /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
++        mov     DAT2, DAT0
++        mov     DAT3, DAT0
++        /* Now the inner loop of 16-byte stores */
++165:    stmia   S!, {DAT0, DAT1, DAT2, DAT3}
++        subs    N, N, #16
++        bhs     165b
++166:    /* Trailing words and bytes */
++        movs    N, N, lsl #29           /* CS = bit 3 of N (8 bytes), MI = bit 2 (4 bytes) */
++        stmcsia S!, {DAT0, DAT1}
++        strmi   DAT0, [S], #4
++        movs    N, N, lsl #2            /* CS = bit 1 of the original N (2 bytes), MI = bit 0 (1 byte) */
++        strcsh  DAT0, [S], #2
++        strmib  DAT0, [S]
++199:    pop     {S, pc}                 /* restore a1 and return */
++
++170:    /* Short case */
++        mov     DAT2, DAT0
++        mov     DAT3, DAT0
++        tst     S, #3
++        beq     174f
++172:    subs    N, N, #1                /* store single bytes until S is word aligned or N runs out */
++        blo     199b
++        strb    DAT0, [S], #1
++        tst     S, #3
++        bne     172b
++174:    tst     N, #16                  /* one 16-byte store if bit 4 of N is set, then the shared tail */
++        stmneia S!, {DAT0, DAT1, DAT2, DAT3}
++        b       166b
++
++        .unreq  S
++        .unreq  DAT0
++        .unreq  N
++        .unreq  DAT1
++        .unreq  DAT2
++        .unreq  DAT3
++ENDPROC(memset)
++ENDPROC(mmioset)
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -22,6 +22,14 @@
+ #include <asm/current.h>
+ #include <asm/page.h>
+ 
++#ifndef COPY_FROM_USER_THRESHOLD
++#define COPY_FROM_USER_THRESHOLD 64 /* bytes; smaller copies use __copy_from_user_std */
++#endif
++
++#ifndef COPY_TO_USER_THRESHOLD
++#define COPY_TO_USER_THRESHOLD 64 /* bytes; smaller copies use __copy_to_user_std */
++#endif
++
+ static int
+ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+ {
+@@ -85,7 +93,44 @@ pin_page_for_write(const void __user *_a
+       return 1;
+ }
+ 
+-static unsigned long noinline
++static int
++pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
++{
++      unsigned long addr = (unsigned long)_addr;
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++      pud_t *pud;
++      spinlock_t *ptl;
++
++      pgd = pgd_offset(current->mm, addr);    /* walk the page tables of current->mm for addr */
++      if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
++      {
++              return 0;       /* 0 = not pinned, nothing is left locked */
++      }
++      pud = pud_offset(pgd, addr);
++      if (unlikely(pud_none(*pud) || pud_bad(*pud)))
++      {
++              return 0;
++      }
++
++      pmd = pmd_offset(pud, addr);
++      if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
++              return 0;
++
++      pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
++      if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {
++              pte_unmap_unlock(pte, ptl);     /* not present or not young: release and report failure */
++              return 0;
++      }
++
++      *ptep = pte;    /* handed back still mapped and locked; */
++      *ptlp = ptl;    /* the caller has to pte_unmap_unlock() them */
++
++      return 1;       /* 1 = pinned */
++}
++
++unsigned long noinline
+ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
+ {
+       unsigned long ua_flags;
+@@ -138,6 +183,54 @@ out:
+       return n;
+ }
+ 
++unsigned long noinline
++__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
++{
++      int atomic;
++
++      if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
++              memcpy(to, (const void *)from, n);      /* kernel segment: plain copy, nothing to pin */
++              return 0;
++      }
++
++      /* the mmap semaphore is taken only if not in an atomic context */
++      atomic = in_atomic();
++
++      if (!atomic)
++              down_read(&current->mm->mmap_sem);
++      while (n) {     /* each pass copies at most up to the end of one source page */
++              pte_t *pte;
++              spinlock_t *ptl;
++              int tocopy;
++
++              while (!pin_page_for_read(from, &pte, &ptl)) {  /* retry until the page is pinned */
++                      char temp;
++                      if (!atomic)
++                              up_read(&current->mm->mmap_sem);
++                      if (__get_user(temp, (char __user *)from))      /* presumably faults the page in - TODO confirm */
++                              goto out;       /* mmap_sem was already released just above */
++                      if (!atomic)
++                              down_read(&current->mm->mmap_sem);
++              }
++
++              tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;       /* bytes up to the end of the page holding from */
++              if (tocopy > n)
++                      tocopy = n;
++
++              memcpy(to, (const void *)from, tocopy); /* done while the PTE from pin_page_for_read is still locked */
++              to += tocopy;
++              from += tocopy;
++              n -= tocopy;
++
++              pte_unmap_unlock(pte, ptl);
++      }
++      if (!atomic)
++              up_read(&current->mm->mmap_sem);
++
++out:
++      return n;       /* number of bytes that were not copied */
++}
++
+ unsigned long
+ arm_copy_to_user(void __user *to, const void *from, unsigned long n)
+ {
+@@ -148,7 +241,7 @@ arm_copy_to_user(void __user *to, const
+        * With frame pointer disabled, tail call optimization kicks in
+        * as well making this test almost invisible.
+        */
+-      if (n < 64) {
++      if (n < COPY_TO_USER_THRESHOLD) {
+               unsigned long ua_flags = uaccess_save_and_enable();
+               n = __copy_to_user_std(to, from, n);
+               uaccess_restore(ua_flags);
+@@ -157,6 +250,21 @@ arm_copy_to_user(void __user *to, const
+       }
+       return n;
+ }
++
++unsigned long __must_check
++arm_copy_from_user(void *to, const void __user *from, unsigned long n)
++{
++      /*
++       * The size test is done in this small wrapper to keep the
++       * overhead for small copies low (no large register dump on the
++       * stack): below COPY_FROM_USER_THRESHOLD use __copy_from_user_std.
++       * NOTE(review): unlike arm_copy_to_user, no uaccess_save_and_enable()
++       * is done here - confirm that the caller enables user access.
++       */
++      if (n < COPY_FROM_USER_THRESHOLD)
++              return __copy_from_user_std(to, from, n);
++      return __copy_from_user_memcpy(to, from, n);
++}
+       
+ static unsigned long noinline
+ __clear_user_memset(void __user *addr, unsigned long n)