1 From 49e011c979aee23801198617a0052b0b087583a6 Mon Sep 17 00:00:00 2001
2 From: Phil Elwell <phil@raspberrypi.org>
3 Date: Mon, 13 Oct 2014 11:47:53 +0100
4 Subject: [PATCH 062/114] Improve __copy_to_user and __copy_from_user
7 Provide a __copy_from_user that uses memcpy. On BCM2708, use
8 optimised memcpy/memmove/memcmp/memset implementations.
10 arch/arm/include/asm/string.h | 5 +
11 arch/arm/include/asm/uaccess.h | 1 +
12 arch/arm/lib/Makefile | 15 +-
13 arch/arm/lib/arm-mem.h | 159 ++++++++++++
14 arch/arm/lib/copy_from_user.S | 4 +-
15 arch/arm/lib/exports_rpi.c | 37 +++
16 arch/arm/lib/memcmp_rpi.S | 285 +++++++++++++++++++++
17 arch/arm/lib/memcpy_rpi.S | 59 +++++
18 arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++++++++++
19 arch/arm/lib/memmove_rpi.S | 61 +++++
20 arch/arm/lib/memset_rpi.S | 121 +++++++++
21 arch/arm/lib/uaccess_with_memcpy.c | 112 +++++++-
22 12 files changed, 1359 insertions(+), 6 deletions(-)
23 create mode 100644 arch/arm/lib/arm-mem.h
24 create mode 100644 arch/arm/lib/exports_rpi.c
25 create mode 100644 arch/arm/lib/memcmp_rpi.S
26 create mode 100644 arch/arm/lib/memcpy_rpi.S
27 create mode 100644 arch/arm/lib/memcpymove.h
28 create mode 100644 arch/arm/lib/memmove_rpi.S
29 create mode 100644 arch/arm/lib/memset_rpi.S
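
For orientation: the assembly __copy_from_user is kept under the __copy_from_user_std name and becomes a weak fallback (mirroring the existing __copy_to_user arrangement), while a new C wrapper in uaccess_with_memcpy.c routes large copies through a page-pinning memcpy path. A condensed sketch of that wrapper, as added near the end of this patch:

	unsigned long
	__copy_from_user(void *to, const void __user *from, unsigned long n)
	{
		if (n < COPY_FROM_USER_THRESHOLD)		/* 1600 on BCM2708, 64 otherwise */
			return __copy_from_user_std(to, from, n);	/* original assembly routine */
		return __copy_from_user_memcpy(to, from, n);	/* pin user pages, then memcpy */
	}
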
31 --- a/arch/arm/include/asm/string.h
32 +++ b/arch/arm/include/asm/string.h
33 @@ -24,6 +24,11 @@ extern void * memchr(const void *, int,
34 #define __HAVE_ARCH_MEMSET
35 extern void * memset(void *, int, __kernel_size_t);
37 +#ifdef CONFIG_MACH_BCM2708
38 +#define __HAVE_ARCH_MEMCMP
39 +extern int memcmp(const void *, const void *, size_t);
42 extern void __memzero(void *ptr, __kernel_size_t n);
44 #define memset(p,v,n) \
45 --- a/arch/arm/include/asm/uaccess.h
46 +++ b/arch/arm/include/asm/uaccess.h
47 @@ -475,6 +475,7 @@ do { \
50 extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n);
51 +extern unsigned long __must_check __copy_from_user_std(void *to, const void __user *from, unsigned long n);
52 extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n);
53 extern unsigned long __must_check __copy_to_user_std(void __user *to, const void *from, unsigned long n);
54 extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
55 --- a/arch/arm/lib/Makefile
56 +++ b/arch/arm/lib/Makefile
59 lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
60 csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
61 - delay.o delay-loop.o findbit.o memchr.o memcpy.o \
62 - memmove.o memset.o memzero.o setbit.o \
63 - strchr.o strrchr.o \
64 + delay.o delay-loop.o findbit.o memchr.o memzero.o \
65 + setbit.o strchr.o strrchr.o \
66 testchangebit.o testclearbit.o testsetbit.o \
67 ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
68 ucmpdi2.o lib1funcs.o div64.o \
69 io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
70 call_with_stack.o bswapsdi2.o
72 +# Choose optimised implementations for Raspberry Pi
73 +ifeq ($(CONFIG_MACH_BCM2708),y)
74 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
75 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
76 + obj-$(CONFIG_MODULES) += exports_rpi.o
77 + lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
79 + lib-y += memcpy.o memmove.o memset.o
82 mmu-y := clear_user.o copy_page.o getuser.o putuser.o
84 # the code in uaccess.S is not preemption safe and
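
The two CFLAGS lines above tune the crossover sizes for BCM2708; when they are absent, uaccess_with_memcpy.c falls back to its own defaults, as in the #ifndef block later in this patch:

	#ifndef COPY_FROM_USER_THRESHOLD
	#define COPY_FROM_USER_THRESHOLD 64	/* overridden to 1600 by the BCM2708 Makefile flags */
	#endif

	#ifndef COPY_TO_USER_THRESHOLD
	#define COPY_TO_USER_THRESHOLD 64	/* overridden to 672 by the BCM2708 Makefile flags */
	#endif
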
86 +++ b/arch/arm/lib/arm-mem.h
89 +Copyright (c) 2013, Raspberry Pi Foundation
90 +Copyright (c) 2013, RISC OS Open Ltd
93 +Redistribution and use in source and binary forms, with or without
94 +modification, are permitted provided that the following conditions are met:
95 + * Redistributions of source code must retain the above copyright
96 + notice, this list of conditions and the following disclaimer.
97 + * Redistributions in binary form must reproduce the above copyright
98 + notice, this list of conditions and the following disclaimer in the
99 + documentation and/or other materials provided with the distribution.
100 + * Neither the name of the copyright holder nor the
101 + names of its contributors may be used to endorse or promote products
102 + derived from this software without specific prior written permission.
104 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
105 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
106 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
107 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
108 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
109 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
110 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
111 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
112 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
113 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
122 +.macro preload_leading_step1 backwards, ptr, base
123 +/* If the destination is already 16-byte aligned, then we need to preload
124 + * between 0 and prefetch_distance (inclusive) cache lines ahead so there
125 + * are no gaps when the inner loop starts.
134 + .rept prefetch_distance+1
137 + .set OFFSET, OFFSET-32
139 + .set OFFSET, OFFSET+32
144 +.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp
145 +/* However, if the destination is not 16-byte aligned, we may need to
146 + * preload one more cache line than that. The question we need to ask is:
147 + * are the leading bytes more than the amount by which the source
148 + * pointer will be rounded down for preloading, and if so, by how many
152 +/* Here we compare against how many bytes we are into the
153 + * cache line, counting down from the highest such address.
154 + * Effectively, we want to calculate
155 + * leading_bytes = dst&15
156 + * cacheline_offset = 31-((src-leading_bytes-1)&31)
157 + * extra_needed = leading_bytes - cacheline_offset
158 + * and test if extra_needed is <= 0, or rearranging:
159 + * leading_bytes + (src-leading_bytes-1)&31 <= 31
161 + mov tmp, base, lsl #32-5
162 + sbc tmp, tmp, leading_bytes, lsl #32-5
163 + adds tmp, tmp, leading_bytes, lsl #32-5
165 + pld [ptr, #-32*(prefetch_distance+1)]
167 +/* Effectively, we want to calculate
168 + * leading_bytes = (-dst)&15
169 + * cacheline_offset = (src+leading_bytes)&31
170 + * extra_needed = leading_bytes - cacheline_offset
171 + * and test if extra_needed is <= 0.
173 + mov tmp, base, lsl #32-5
174 + add tmp, tmp, leading_bytes, lsl #32-5
175 + rsbs tmp, tmp, leading_bytes, lsl #32-5
177 + pld [ptr, #32*(prefetch_distance+1)]
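
A worked numeric example of the forwards-case test above (addresses chosen purely for illustration, not taken from the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned int src = 0x100021, dst = 0x200007;	/* illustrative addresses */
		unsigned int leading_bytes = (0u - dst) & 15;	/* 9 bytes until dst is 16-byte aligned */
		unsigned int cacheline_offset = (src + leading_bytes) & 31;	/* 10: offset, within its cache line,
										 * of the first source byte of the aligned part */
		int extra_needed = (int)leading_bytes - (int)cacheline_offset;	/* 9 - 10 = -1 */

		/* extra_needed <= 0: the leading bytes do not spill into a cache line
		 * beyond those covered by preload_leading_step1, so no extra pld is issued. */
		printf("extra_needed = %d\n", extra_needed);
		return 0;
	}
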
182 +.macro preload_trailing backwards, base, remain, tmp
183 + /* We need either 0, 1 or 2 extra preloads */
186 + mov tmp, tmp, lsl #32-5
188 + mov tmp, base, lsl #32-5
190 + adds tmp, tmp, remain, lsl #32-5
191 + adceqs tmp, tmp, #0
192 + /* The instruction above has two effects: it ensures Z is only
193 + * set if C was clear (so Z indicates that both shifted quantities
194 + * were 0), and it clears C if Z was set (so C indicates that the sum
195 + * of the shifted quantities was greater than, and not equal to, 32) */
205 + pld [tmp, #-32*(prefetch_distance+1)]
207 + pld [tmp, #-32*prefetch_distance]
209 + pld [tmp, #32*(prefetch_distance+2)]
211 + pld [tmp, #32*(prefetch_distance+1)]
216 +.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1
219 + bic tmp0, tmp0, #31
221 + sub tmp1, base, remain, lsl #shift
223 + bic tmp0, base, #31
225 + add tmp1, base, remain, lsl #shift
228 + bic tmp1, tmp1, #31
232 + /* In this case, all the data fits in either 1 or 2 cache lines */
237 + sub tmp0, tmp0, #32
239 + add tmp0, tmp0, #32
247 --- a/arch/arm/lib/copy_from_user.S
248 +++ b/arch/arm/lib/copy_from_user.S
253 -ENTRY(__copy_from_user)
254 +ENTRY(__copy_from_user_std)
255 +WEAK(__copy_from_user)
257 #include "copy_template.S"
259 ENDPROC(__copy_from_user)
260 +ENDPROC(__copy_from_user_std)
262 .pushsection .fixup,"ax"
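
The hunk above keeps the assembly routine reachable as __copy_from_user_std and demotes __copy_from_user to a weak symbol, so the C implementation added to uaccess_with_memcpy.c can override it at link time. A generic illustration of that weak/strong pattern (hypothetical names, two separate source files):

	/* fallback.c: weak default, analogous to WEAK(__copy_from_user)
	 * resolving to the generic assembly body */
	int __attribute__((weak)) copy_op(int x)
	{
		return x;
	}

	/* optimised.c: strong definition; if this object is linked in, the
	 * linker picks it over the weak default, just as the C
	 * __copy_from_user overrides the assembly one */
	int copy_op(int x)
	{
		return 2 * x;
	}
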
265 +++ b/arch/arm/lib/exports_rpi.c
268 + * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
270 + * Redistribution and use in source and binary forms, with or without
271 + * modification, are permitted provided that the following conditions
273 + * 1. Redistributions of source code must retain the above copyright
274 + * notice, this list of conditions, and the following disclaimer,
275 + * without modification.
276 + * 2. Redistributions in binary form must reproduce the above copyright
277 + * notice, this list of conditions and the following disclaimer in the
278 + * documentation and/or other materials provided with the distribution.
279 + * 3. The names of the above-listed copyright holders may not be used
280 + * to endorse or promote products derived from this software without
281 + * specific prior written permission.
283 + * ALTERNATIVELY, this software may be distributed under the terms of the
284 + * GNU General Public License ("GPL") version 2, as published by the Free
285 + * Software Foundation.
287 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
288 + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
289 + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
290 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
291 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
292 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
293 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
294 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
295 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
296 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
297 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
300 +#include <linux/kernel.h>
301 +#include <linux/module.h>
303 +EXPORT_SYMBOL(memcmp);
305 +++ b/arch/arm/lib/memcmp_rpi.S
308 +Copyright (c) 2013, Raspberry Pi Foundation
309 +Copyright (c) 2013, RISC OS Open Ltd
310 +All rights reserved.
312 +Redistribution and use in source and binary forms, with or without
313 +modification, are permitted provided that the following conditions are met:
314 + * Redistributions of source code must retain the above copyright
315 + notice, this list of conditions and the following disclaimer.
316 + * Redistributions in binary form must reproduce the above copyright
317 + notice, this list of conditions and the following disclaimer in the
318 + documentation and/or other materials provided with the distribution.
319 + * Neither the name of the copyright holder nor the
320 + names of its contributors may be used to endorse or promote products
321 + derived from this software without specific prior written permission.
323 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
324 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
325 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
326 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
327 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
328 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
329 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
330 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
331 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
332 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
335 +#include <linux/linkage.h>
336 +#include "arm-mem.h"
338 +/* Prevent the stack from becoming executable */
339 +#if defined(__linux__) && defined(__ELF__)
340 +.section .note.GNU-stack,"",%progbits
350 +.macro memcmp_process_head unaligned
352 + ldr DAT0, [S_1], #4
353 + ldr DAT1, [S_1], #4
354 + ldr DAT2, [S_1], #4
355 + ldr DAT3, [S_1], #4
357 + ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
359 + ldmia S_2!, {DAT4, DAT5, DAT6, DAT7}
362 +.macro memcmp_process_tail
370 +.macro memcmp_leading_31bytes
371 + movs DAT0, OFF, lsl #31
372 + ldrmib DAT0, [S_1], #1
373 + ldrcsh DAT1, [S_1], #2
374 + ldrmib DAT4, [S_2], #1
375 + ldrcsh DAT5, [S_2], #2
385 + movs DAT0, OFF, lsl #29
386 + ldrmi DAT0, [S_1], #4
387 + ldrcs DAT1, [S_1], #4
388 + ldrcs DAT2, [S_1], #4
389 + ldrmi DAT4, [S_2], #4
390 + ldmcsia S_2!, {DAT5, DAT6}
405 + memcmp_process_head 1
407 + memcmp_process_tail
411 +.macro memcmp_trailing_15bytes unaligned
414 + ldrcs DAT0, [S_1], #4
415 + ldrcs DAT1, [S_1], #4
417 + ldmcsia S_1!, {DAT0, DAT1}
419 + ldrmi DAT2, [S_1], #4
420 + ldmcsia S_2!, {DAT4, DAT5}
421 + ldrmi DAT6, [S_2], #4
433 + ldrcsh DAT0, [S_1], #2
435 + ldrcsh DAT4, [S_2], #2
446 +.macro memcmp_long_inner_loop unaligned
448 + memcmp_process_head unaligned
449 + pld [S_2, #prefetch_distance*32 + 16]
450 + memcmp_process_tail
451 + memcmp_process_head unaligned
453 + memcmp_process_tail
456 + /* Just before the final (prefetch_distance+1) 32-byte blocks,
457 + * deal with final preloads */
458 + preload_trailing 0, S_1, N, DAT0
459 + preload_trailing 0, S_2, N, DAT0
460 + add N, N, #(prefetch_distance+2)*32 - 16
462 + memcmp_process_head unaligned
463 + memcmp_process_tail
466 + /* Trailing words and bytes */
469 + memcmp_trailing_15bytes unaligned
470 +199: /* Reached end without detecting a difference */
473 + pop {DAT1-DAT6, pc}
476 +.macro memcmp_short_inner_loop unaligned
477 + subs N, N, #16 /* simplifies inner loop termination */
480 + memcmp_process_head unaligned
481 + memcmp_process_tail
484 +122: /* Trailing words and bytes */
487 + memcmp_trailing_15bytes unaligned
488 +199: /* Reached end without detecting a difference */
491 + pop {DAT1-DAT6, pc}
495 + * int memcmp(const void *s1, const void *s2, size_t n);
497 + * a1 = pointer to buffer 1
498 + * a2 = pointer to buffer 2
499 + * a3 = number of bytes to compare (as unsigned chars)
501 + * a1 = >0/=0/<0 if s1 >/=/< s2
504 +.set prefetch_distance, 2
520 + push {DAT1-DAT6, lr}
521 + setend be /* lowest-addressed bytes are most significant */
523 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
524 + cmp N, #(prefetch_distance+3)*32 - 1
528 + /* Adjust N so that the decrement instruction can also test for
529 + * inner loop termination. We want it to stop when there are
530 + * (prefetch_distance+1) complete blocks to go. */
531 + sub N, N, #(prefetch_distance+2)*32
532 + preload_leading_step1 0, DAT0, S_1
533 + preload_leading_step1 0, DAT1, S_2
536 + rsb OFF, S_2, #0 /* no need to AND with 15 here */
537 + preload_leading_step2 0, DAT0, S_1, OFF, DAT2
538 + preload_leading_step2 0, DAT1, S_2, OFF, DAT2
539 + memcmp_leading_31bytes
540 +154: /* Second source now cacheline (32-byte) aligned; we have at
541 + * least one prefetch to go. */
542 + /* Prefetch offset is best selected such that it lies in the
543 + * first 8 of each 32 bytes - but it's just as easy to aim for
546 + rsb OFF, OFF, #32*prefetch_distance
549 + memcmp_long_inner_loop 0
550 +140: memcmp_long_inner_loop 1
552 +170: /* Short case */
555 + preload_all 0, 0, 0, S_1, N, DAT0, DAT1
556 + preload_all 0, 0, 0, S_2, N, DAT0, DAT1
561 + ldrb DAT0, [S_1], #1
562 + ldrb DAT4, [S_2], #1
567 +174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
570 + memcmp_short_inner_loop 0
571 +140: memcmp_short_inner_loop 1
573 +200: /* Difference found: determine sign. */
577 + pop {DAT1-DAT6, pc}
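
The reason memcmp can compare whole words is the setend be near its entry point: with big-endian loads the lowest-addressed byte lands in the most significant bits, so an unsigned word comparison orders the buffers exactly as a byte-by-byte memcmp would. A small C model of that idea (illustrative only):

	#include <stdint.h>

	/* Pack four bytes with the lowest address in the most significant
	 * position, as a big-endian ldr does after "setend be". */
	static uint32_t load_be32(const unsigned char *p)
	{
		return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
	}

	/* Same sign as memcmp(a, b, 4): the first differing byte sits in the
	 * most significant differing position of the packed words. */
	static int wordwise_cmp4(const unsigned char *a, const unsigned char *b)
	{
		uint32_t x = load_be32(a), y = load_be32(b);

		return (x > y) - (x < y);
	}
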
593 +++ b/arch/arm/lib/memcpy_rpi.S
596 +Copyright (c) 2013, Raspberry Pi Foundation
597 +Copyright (c) 2013, RISC OS Open Ltd
598 +All rights reserved.
600 +Redistribution and use in source and binary forms, with or without
601 +modification, are permitted provided that the following conditions are met:
602 + * Redistributions of source code must retain the above copyright
603 + notice, this list of conditions and the following disclaimer.
604 + * Redistributions in binary form must reproduce the above copyright
605 + notice, this list of conditions and the following disclaimer in the
606 + documentation and/or other materials provided with the distribution.
607 + * Neither the name of the copyright holder nor the
608 + names of its contributors may be used to endorse or promote products
609 + derived from this software without specific prior written permission.
611 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
612 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
613 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
614 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
615 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
616 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
617 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
618 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
619 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
620 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
623 +#include <linux/linkage.h>
624 +#include "arm-mem.h"
625 +#include "memcpymove.h"
627 +/* Prevent the stack from becoming executable */
628 +#if defined(__linux__) && defined(__ELF__)
629 +.section .note.GNU-stack,"",%progbits
640 + * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
642 + * a1 = pointer to destination
643 + * a2 = pointer to source
644 + * a3 = number of bytes to copy
649 +.set prefetch_distance, 3
655 +++ b/arch/arm/lib/memcpymove.h
658 +Copyright (c) 2013, Raspberry Pi Foundation
659 +Copyright (c) 2013, RISC OS Open Ltd
660 +All rights reserved.
662 +Redistribution and use in source and binary forms, with or without
663 +modification, are permitted provided that the following conditions are met:
664 + * Redistributions of source code must retain the above copyright
665 + notice, this list of conditions and the following disclaimer.
666 + * Redistributions in binary form must reproduce the above copyright
667 + notice, this list of conditions and the following disclaimer in the
668 + documentation and/or other materials provided with the distribution.
669 + * Neither the name of the copyright holder nor the
670 + names of its contributors may be used to endorse or promote products
671 + derived from this software without specific prior written permission.
673 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
674 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
675 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
676 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
677 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
678 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
679 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
680 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
681 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
682 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
685 +.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
688 + mov r1, r0, lsl #32-align*8
690 + orr r1, r1, r0, lsr #align*8
693 + mov r0, r1, lsr #align*8
695 + orr r0, r0, r1, lsl #32-align*8
701 + mov r2, r0, lsl #32-align*8
703 + orr r2, r2, r1, lsr #align*8
704 + mov r1, r1, lsl #32-align*8
705 + orr r1, r1, r0, lsr #align*8
709 + mov r0, r2, lsr #align*8
711 + orr r0, r0, r1, lsl #32-align*8
712 + mov r1, r1, lsr #align*8
713 + orr r1, r1, r2, lsl #32-align*8
719 + mov r4, r0, lsl #32-align*8
721 + orr r4, r4, r3, lsr #align*8
722 + mov r3, r3, lsl #32-align*8
723 + orr r3, r3, r2, lsr #align*8
724 + mov r2, r2, lsl #32-align*8
725 + orr r2, r2, r1, lsr #align*8
726 + mov r1, r1, lsl #32-align*8
727 + orr r1, r1, r0, lsr #align*8
728 + stmdb D!, {r1, r2, r3, r4}
731 + mov r0, r4, lsr #align*8
733 + orr r0, r0, r1, lsl #32-align*8
734 + mov r1, r1, lsr #align*8
735 + orr r1, r1, r2, lsl #32-align*8
736 + mov r2, r2, lsr #align*8
737 + orr r2, r2, r3, lsl #32-align*8
738 + mov r3, r3, lsr #align*8
739 + orr r3, r3, r4, lsl #32-align*8
740 + stmia D!, {r0, r1, r2, r3}
744 + ldmdb S!, {r4, r5, r6, r7}
745 + mov r8, r0, lsl #32-align*8
746 + ldmdb S!, {r0, r1, r2, r3}
750 + orr r8, r8, r7, lsr #align*8
751 + mov r7, r7, lsl #32-align*8
752 + orr r7, r7, r6, lsr #align*8
753 + mov r6, r6, lsl #32-align*8
754 + orr r6, r6, r5, lsr #align*8
755 + mov r5, r5, lsl #32-align*8
756 + orr r5, r5, r4, lsr #align*8
757 + mov r4, r4, lsl #32-align*8
758 + orr r4, r4, r3, lsr #align*8
759 + mov r3, r3, lsl #32-align*8
760 + orr r3, r3, r2, lsr #align*8
761 + mov r2, r2, lsl #32-align*8
762 + orr r2, r2, r1, lsr #align*8
763 + mov r1, r1, lsl #32-align*8
764 + orr r1, r1, r0, lsr #align*8
765 + stmdb D!, {r5, r6, r7, r8}
766 + stmdb D!, {r1, r2, r3, r4}
768 + ldmib S!, {r1, r2, r3, r4}
769 + mov r0, r8, lsr #align*8
770 + ldmib S!, {r5, r6, r7, r8}
774 + orr r0, r0, r1, lsl #32-align*8
775 + mov r1, r1, lsr #align*8
776 + orr r1, r1, r2, lsl #32-align*8
777 + mov r2, r2, lsr #align*8
778 + orr r2, r2, r3, lsl #32-align*8
779 + mov r3, r3, lsr #align*8
780 + orr r3, r3, r4, lsl #32-align*8
781 + mov r4, r4, lsr #align*8
782 + orr r4, r4, r5, lsl #32-align*8
783 + mov r5, r5, lsr #align*8
784 + orr r5, r5, r6, lsl #32-align*8
785 + mov r6, r6, lsr #align*8
786 + orr r6, r6, r7, lsl #32-align*8
787 + mov r7, r7, lsr #align*8
788 + orr r7, r7, r8, lsl #32-align*8
789 + stmia D!, {r0, r1, r2, r3}
790 + stmia D!, {r4, r5, r6, r7}
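
The shift/orr pairs above realign data when source and destination are misaligned relative to each other: each aligned destination word is assembled from the two aligned source words that straddle it. A little-endian C model of one such pair, where align is the relative misalignment in bytes (1 to 3):

	#include <stdint.h>

	/* Forwards-copy case: 'lo' is the aligned source word holding the first
	 * wanted byte, 'hi' is the following word; mirrors the
	 *   mov rX, lo, lsr #align*8 ; orr rX, rX, hi, lsl #32-align*8
	 * pattern used by unaligned_words. */
	static uint32_t realign_word(uint32_t lo, uint32_t hi, unsigned int align)
	{
		return (lo >> (align * 8)) | (hi << (32 - align * 8));
	}
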
795 +.macro memcpy_leading_15bytes backwards, align
796 + movs DAT1, DAT2, lsl #31
799 + ldrmib DAT0, [S, #-1]!
800 + ldrcsh DAT1, [S, #-2]!
801 + strmib DAT0, [D, #-1]!
802 + strcsh DAT1, [D, #-2]!
804 + ldrmib DAT0, [S], #1
805 + ldrcsh DAT1, [S], #2
806 + strmib DAT0, [D], #1
807 + strcsh DAT1, [D], #2
809 + movs DAT1, DAT2, lsl #29
811 + ldrmi DAT0, [S, #-4]!
813 + ldmcsdb S!, {DAT1, DAT2}
815 + ldrcs DAT2, [S, #-4]!
816 + ldrcs DAT1, [S, #-4]!
818 + strmi DAT0, [D, #-4]!
819 + stmcsdb D!, {DAT1, DAT2}
821 + ldrmi DAT0, [S], #4
823 + ldmcsia S!, {DAT1, DAT2}
825 + ldrcs DAT1, [S], #4
826 + ldrcs DAT2, [S], #4
828 + strmi DAT0, [D], #4
829 + stmcsia D!, {DAT1, DAT2}
833 +.macro memcpy_trailing_15bytes backwards, align
837 + ldmcsdb S!, {DAT0, DAT1}
839 + ldrcs DAT1, [S, #-4]!
840 + ldrcs DAT0, [S, #-4]!
842 + ldrmi DAT2, [S, #-4]!
843 + stmcsdb D!, {DAT0, DAT1}
844 + strmi DAT2, [D, #-4]!
847 + ldmcsia S!, {DAT0, DAT1}
849 + ldrcs DAT0, [S], #4
850 + ldrcs DAT1, [S], #4
852 + ldrmi DAT2, [S], #4
853 + stmcsia D!, {DAT0, DAT1}
854 + strmi DAT2, [D], #4
858 + ldrcsh DAT0, [S, #-2]!
859 + ldrmib DAT1, [S, #-1]
860 + strcsh DAT0, [D, #-2]!
861 + strmib DAT1, [D, #-1]
863 + ldrcsh DAT0, [S], #2
865 + strcsh DAT0, [D], #2
870 +.macro memcpy_long_inner_loop backwards, align
873 + ldr DAT0, [S, #-align]!
875 + ldr LAST, [S, #-align]!
881 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
883 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
884 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
886 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
888 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
889 + stmia D!, {DAT4, DAT5, DAT6, LAST}
892 + unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
896 + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
897 + preload_trailing backwards, S, N, OFF
898 + add N, N, #(prefetch_distance+2)*32 - 32
902 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
903 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
904 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
906 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
907 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
908 + stmia D!, {DAT4, DAT5, DAT6, LAST}
911 + unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
918 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
919 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
921 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
922 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
926 + unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
929 + /* Trailing words and bytes */
935 + memcpy_trailing_15bytes backwards, align
937 + pop {DAT3, DAT4, DAT5, DAT6, DAT7}
938 + pop {D, DAT1, DAT2, pc}
941 +.macro memcpy_medium_inner_loop backwards, align
945 + ldmdb S!, {DAT0, DAT1, DAT2, LAST}
947 + ldr LAST, [S, #-4]!
948 + ldr DAT2, [S, #-4]!
949 + ldr DAT1, [S, #-4]!
950 + ldr DAT0, [S, #-4]!
952 + stmdb D!, {DAT0, DAT1, DAT2, LAST}
955 + ldmia S!, {DAT0, DAT1, DAT2, LAST}
962 + stmia D!, {DAT0, DAT1, DAT2, LAST}
966 + /* Trailing words and bytes */
969 + memcpy_trailing_15bytes backwards, align
971 + pop {D, DAT1, DAT2, pc}
974 +.macro memcpy_short_inner_loop backwards, align
978 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
980 + ldrne LAST, [S, #-4]!
981 + ldrne DAT2, [S, #-4]!
982 + ldrne DAT1, [S, #-4]!
983 + ldrne DAT0, [S, #-4]!
985 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
988 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
990 + ldrne DAT0, [S], #4
991 + ldrne DAT1, [S], #4
992 + ldrne DAT2, [S], #4
993 + ldrne LAST, [S], #4
995 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
997 + memcpy_trailing_15bytes backwards, align
999 + pop {D, DAT1, DAT2, pc}
1002 +.macro memcpy backwards
1019 + push {D, DAT1, DAT2, lr}
1021 + .cfi_def_cfa_offset 16
1022 + .cfi_rel_offset D, 0
1025 + .cfi_undefined DAT0
1026 + .cfi_rel_offset DAT1, 4
1027 + .cfi_rel_offset DAT2, 8
1028 + .cfi_undefined LAST
1029 + .cfi_rel_offset lr, 12
1036 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1039 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
1040 + cmp N, #(prefetch_distance+3)*32 - 1
1044 + push {DAT3, DAT4, DAT5, DAT6, DAT7}
1046 + .cfi_def_cfa_offset 36
1047 + .cfi_rel_offset D, 20
1048 + .cfi_rel_offset DAT1, 24
1049 + .cfi_rel_offset DAT2, 28
1050 + .cfi_rel_offset DAT3, 0
1051 + .cfi_rel_offset DAT4, 4
1052 + .cfi_rel_offset DAT5, 8
1053 + .cfi_rel_offset DAT6, 12
1054 + .cfi_rel_offset DAT7, 16
1055 + .cfi_rel_offset lr, 32
1057 + /* Adjust N so that the decrement instruction can also test for
1058 + * inner loop termination. We want it to stop when there are
1059 + * (prefetch_distance+1) complete blocks to go. */
1060 + sub N, N, #(prefetch_distance+2)*32
1061 + preload_leading_step1 backwards, DAT0, S
1063 + /* Bug in GAS: it accepts, but mis-assembles the instruction
1064 + * ands DAT2, D, #60, 2
1065 + * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
1072 + rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
1074 + preload_leading_step2 backwards, DAT0, S, DAT2, OFF
1075 + memcpy_leading_15bytes backwards, 1
1076 +154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
1077 + /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
1081 + sub OFF, OFF, #32*(prefetch_distance+1)
1084 + rsb OFF, OFF, #32*prefetch_distance
1086 + movs DAT0, S, lsl #31
1090 + memcpy_long_inner_loop backwards, 0
1091 +155: memcpy_long_inner_loop backwards, 1
1092 +156: memcpy_long_inner_loop backwards, 2
1093 +157: memcpy_long_inner_loop backwards, 3
1095 + .cfi_def_cfa_offset 16
1096 + .cfi_rel_offset D, 0
1097 + .cfi_rel_offset DAT1, 4
1098 + .cfi_rel_offset DAT2, 8
1099 + .cfi_same_value DAT3
1100 + .cfi_same_value DAT4
1101 + .cfi_same_value DAT5
1102 + .cfi_same_value DAT6
1103 + .cfi_same_value DAT7
1104 + .cfi_rel_offset lr, 12
1106 +160: /* Medium case */
1107 + preload_all backwards, 0, 0, S, N, DAT2, OFF
1108 + sub N, N, #16 /* simplifies inner loop termination */
1115 + rsb DAT2, DAT2, #16
1117 + memcpy_leading_15bytes backwards, align
1118 +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
1121 + memcpy_medium_inner_loop backwards, 0
1122 +140: memcpy_medium_inner_loop backwards, 1
1124 +170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
1127 + preload_all backwards, 1, 0, S, N, DAT2, LAST
1133 + ldrb DAT0, [S, #-1]!
1134 + strb DAT0, [D, #-1]!
1136 + ldrb DAT0, [S], #1
1137 + strb DAT0, [D], #1
1141 +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
1144 + memcpy_short_inner_loop backwards, 0
1145 +140: memcpy_short_inner_loop backwards, 1
1164 +++ b/arch/arm/lib/memmove_rpi.S
1167 +Copyright (c) 2013, Raspberry Pi Foundation
1168 +Copyright (c) 2013, RISC OS Open Ltd
1169 +All rights reserved.
1171 +Redistribution and use in source and binary forms, with or without
1172 +modification, are permitted provided that the following conditions are met:
1173 + * Redistributions of source code must retain the above copyright
1174 + notice, this list of conditions and the following disclaimer.
1175 + * Redistributions in binary form must reproduce the above copyright
1176 + notice, this list of conditions and the following disclaimer in the
1177 + documentation and/or other materials provided with the distribution.
1178 + * Neither the name of the copyright holder nor the
1179 + names of its contributors may be used to endorse or promote products
1180 + derived from this software without specific prior written permission.
1182 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1183 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1184 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1185 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1186 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1187 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1188 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1189 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1190 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1191 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1194 +#include <linux/linkage.h>
1195 +#include "arm-mem.h"
1196 +#include "memcpymove.h"
1198 +/* Prevent the stack from becoming executable */
1199 +#if defined(__linux__) && defined(__ELF__)
1200 +.section .note.GNU-stack,"",%progbits
1205 + .object_arch armv4
1211 + * void *memmove(void *s1, const void *s2, size_t n);
1213 + * a1 = pointer to destination
1214 + * a2 = pointer to source
1215 + * a3 = number of bytes to copy
1220 +.set prefetch_distance, 3
1224 + bpl memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
1228 +++ b/arch/arm/lib/memset_rpi.S
1231 +Copyright (c) 2013, Raspberry Pi Foundation
1232 +Copyright (c) 2013, RISC OS Open Ltd
1233 +All rights reserved.
1235 +Redistribution and use in source and binary forms, with or without
1236 +modification, are permitted provided that the following conditions are met:
1237 + * Redistributions of source code must retain the above copyright
1238 + notice, this list of conditions and the following disclaimer.
1239 + * Redistributions in binary form must reproduce the above copyright
1240 + notice, this list of conditions and the following disclaimer in the
1241 + documentation and/or other materials provided with the distribution.
1242 + * Neither the name of the copyright holder nor the
1243 + names of its contributors may be used to endorse or promote products
1244 + derived from this software without specific prior written permission.
1246 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1247 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1248 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1249 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1250 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1251 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1252 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1253 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1254 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1255 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1258 +#include <linux/linkage.h>
1259 +#include "arm-mem.h"
1261 +/* Prevent the stack from becoming executable */
1262 +#if defined(__linux__) && defined(__ELF__)
1263 +.section .note.GNU-stack,"",%progbits
1268 + .object_arch armv4
1274 + * void *memset(void *s, int c, size_t n);
1276 + * a1 = pointer to buffer to fill
1277 + * a2 = byte pattern to fill with (caller-narrowed)
1278 + * a3 = number of bytes to fill
1290 + orr DAT0, DAT0, lsl #8
1292 + orr DAT0, DAT0, lsl #16
1295 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1299 +161: sub N, N, #16 /* simplifies inner loop termination */
1300 + /* Leading words and bytes */
1303 + rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
1304 + movs DAT2, DAT3, lsl #31
1306 + strmib DAT0, [S], #1
1308 + strcsh DAT0, [S], #2
1309 + movs DAT2, DAT3, lsl #29
1311 + strmi DAT0, [S], #4
1313 + stmcsia S!, {DAT0, DAT1}
1314 +164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
1317 + /* Now the inner loop of 16-byte stores */
1318 +165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
1321 +166: /* Trailing words and bytes */
1322 + movs N, N, lsl #29
1323 + stmcsia S!, {DAT0, DAT1}
1324 + strmi DAT0, [S], #4
1326 + strcsh DAT0, [S], #2
1330 +170: /* Short case */
1337 + strb DAT0, [S], #1
1341 + stmneia S!, {DAT0, DAT1, DAT2, DAT3}
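
Before its inner loop, memset broadens the fill byte into a full word with the two orr ... lsl #8/#16 instructions, then stores four registers (16 bytes) per stmia. A C model of that core (alignment handling and the short/trailing paths are left out):

	#include <stdint.h>
	#include <stddef.h>

	static void memset_core(uint32_t *dst /* 16-byte aligned */, int c, size_t blocks16)
	{
		uint32_t v = (uint8_t)c;

		v |= v << 8;		/* orr DAT0, DAT0, lsl #8  */
		v |= v << 16;		/* orr DAT0, DAT0, lsl #16 */

		while (blocks16--) {	/* stmia S!, {DAT0, DAT1, DAT2, DAT3} */
			dst[0] = v; dst[1] = v; dst[2] = v; dst[3] = v;
			dst += 4;
		}
	}
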
1351 --- a/arch/arm/lib/uaccess_with_memcpy.c
1352 +++ b/arch/arm/lib/uaccess_with_memcpy.c
1354 #include <asm/current.h>
1355 #include <asm/page.h>
1357 +#ifndef COPY_FROM_USER_THRESHOLD
1358 +#define COPY_FROM_USER_THRESHOLD 64
1361 +#ifndef COPY_TO_USER_THRESHOLD
1362 +#define COPY_TO_USER_THRESHOLD 64
1366 pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1368 @@ -85,7 +93,44 @@ pin_page_for_write(const void __user *_a
1372 -static unsigned long noinline
1374 +pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1376 + unsigned long addr = (unsigned long)_addr;
1383 + pgd = pgd_offset(current->mm, addr);
1384 + if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
1388 + pud = pud_offset(pgd, addr);
1389 + if (unlikely(pud_none(*pud) || pud_bad(*pud)))
1394 + pmd = pmd_offset(pud, addr);
1395 + if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
1398 + pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
1399 + if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {
1400 + pte_unmap_unlock(pte, ptl);
1410 +unsigned long noinline
1411 __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
1414 @@ -135,6 +180,54 @@ out:
1418 +unsigned long noinline
1419 +__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
1423 + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
1424 + memcpy(to, (const void *)from, n);
1428 + /* the mmap semaphore is taken only if not in an atomic context */
1429 + atomic = in_atomic();
1432 + down_read(&current->mm->mmap_sem);
1438 + while (!pin_page_for_read(from, &pte, &ptl)) {
1441 + up_read(&current->mm->mmap_sem);
1442 + if (__get_user(temp, (char __user *)from))
1445 + down_read(&current->mm->mmap_sem);
1448 + tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;
1452 + memcpy(to, (const void *)from, tocopy);
1457 + pte_unmap_unlock(pte, ptl);
1460 + up_read(&current->mm->mmap_sem);
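
Because pin_page_for_read pins a single page at a time, each pass of the loop above copies at most to the end of the current user page; tocopy = (~from & ~PAGE_MASK) + 1 is the number of bytes left in that page (the copy length is also bounded by the remaining count n). A worked example, assuming 4 KiB pages:

	#include <stdio.h>

	#define EXAMPLE_PAGE_MASK 0xfffff000UL	/* 4 KiB pages, illustrative */

	int main(void)
	{
		unsigned long from = 0xb6f01f80UL;	/* offset 0xf80 into its page */
		unsigned long tocopy = (~from & ~EXAMPLE_PAGE_MASK) + 1;

		/* (~0xb6f01f80 & 0xfff) + 1 = 0x7f + 1 = 0x80:
		 * 128 bytes remain in this page. */
		printf("tocopy = %lu\n", tocopy);
		return 0;
	}
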
1467 __copy_to_user(void __user *to, const void *from, unsigned long n)
1469 @@ -145,10 +238,25 @@ __copy_to_user(void __user *to, const vo
1470 * With frame pointer disabled, tail call optimization kicks in
1471 * as well making this test almost invisible.
1474 + if (n < COPY_TO_USER_THRESHOLD)
1475 return __copy_to_user_std(to, from, n);
1476 return __copy_to_user_memcpy(to, from, n);
1480 +__copy_from_user(void *to, const void __user *from, unsigned long n)
1483 + * This test is stubbed out of the main function above to keep
1484 + * the overhead for small copies low by avoiding a large
1485 + * register dump on the stack just to reload them right away.
1486 + * With frame pointer disabled, tail call optimization kicks in
1487 + * as well making this test almost invisible.
1489 + if (n < COPY_FROM_USER_THRESHOLD)
1490 + return __copy_from_user_std(to, from, n);
1491 + return __copy_from_user_memcpy(to, from, n);
1494 static unsigned long noinline
1495 __clear_user_memset(void __user *addr, unsigned long n)