/* Copyright (c) 2005-2013 ARM Ltd.  All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
   3. The name of the company may not be used to endorse or promote
      products derived from this software without specific prior written
      permission.

   THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
   EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* This file gives a basic initialisation of a Cortex-A series core.  It is
   the bare minimum required to get a Cortex-A core running with a
   semihosting interface.  It sets up a basic 1:1 physical address to
   virtual address mapping; turns the MMU on; enables branch prediction;
   activates any integrated caches; enables the Advanced SIMD and VFP
   co-processors; and installs basic exception handlers.  It does not
   handle peripherals, and assumes all memory is Normal.  It does not
   change processor state from the startup privilege and security level.
   This has only been tested to work in ARM state.
By default it assumes exception vectors are located from address 0.
   However, if this is not true they can be moved by defining the
   _rdimon_vector_base symbol.  For example if you have HIVECS enabled you
   may pass --defsym _rdimon_vector_base=0xffff0000 on the linker command
   line.  */

/* __ARM_ARCH_PROFILE is defined from GCC 4.8 onwards, however
   __ARM_ARCH_7A has been defined since 4.2 onwards, which is when v7-a
   support was added and hence 'A' profile support was added in the
   compiler.  Allow for this file to be built with older compilers.  */
#if defined(__ARM_ARCH_7A__) || (__ARM_ARCH_PROFILE == 'A')

	.syntax unified
	.arch armv7-a
	.arm

@ CPU Initialisation.
@ Called from the CRT startup code with the return address in LR.
@ On return (via BX r10): endianness configured, vectors installed at
@ _rdimon_vector_base, a flat 1:1 page table active, MMU + caches +
@ branch prediction on, and VFP/Advanced SIMD enabled when present.
@ Only CPU 0 proceeds; any secondary CPU spins forever at `spin'.
	.globl _rdimon_hw_init_hook
	.type _rdimon_hw_init_hook, %function
_rdimon_hw_init_hook:
	@ Only run the code on CPU 0 - otherwise spin.
	mrc 15, 0, r4, cr0, cr0, 5	@ Read MPIDR
	ands r4, r4, #15		@ Aff0 (CPU id); Z set only on CPU 0
spin:	bne spin			@ Secondary CPUs park here forever
	mov r10, lr			@ Save LR for final return
#ifdef __ARMEB__
	@ Setup for Big Endian.
	setend be
	mrc 15, 0, r4, cr1, cr0, 0	@ Read SCTLR
	orr r4, r4, #(1<<25)		@ Switch to Big Endian (Set SCTLR.EE)
	mcr 15, 0, r4, cr1, cr0, 0	@ Write SCTLR
#else
	@ Setup for Little Endian.
	setend le
	mrc 15, 0, r4, cr1, cr0, 0	@ Read SCTLR
	bic r4, r4, #(1<<25)		@ Switch to LE (unset SCTLR.EE)
	mcr 15, 0, r4, cr1, cr0, 0	@ Write SCTLR
#endif
	bl is_a15_a7			@ Sets Z if core is Cortex-A15/A7
	@ For Cortex-A15 and Cortex-A7 only:
	@ Write zero into the ACTLR to turn everything on.
	itt eq
	moveq r4, #0
	mcreq 15, 0, r4, c1, c0, 1	@ Write ACTLR (A15/A7 only)
	isb
	@ For Cortex-A15 and Cortex-A7 only:
	@ Set ACTLR:SMP bit before enabling the caches and MMU,
	@ or performing any cache and TLB maintenance operations.
	ittt eq
	mrceq 15, 0, r4, c1, c0, 1	@ Read ACTLR
	orreq r4, r4, #(1<<6)		@ Enable ACTLR:SMP
	mcreq 15, 0, r4, c1, c0, 1	@ Write ACTLR
	isb
	@ Setup for exceptions being taken to Thumb/ARM state,
	@ matching the state this file was assembled for.
	mrc 15, 0, r4, cr1, cr0, 0	@ Read SCTLR
#if defined(__thumb__)
	orr r4, r4, #(1 << 30)		@ Enable SCTLR.TE
#else
	bic r4, r4, #(1 << 30)		@ Disable SCTLR.TE
#endif
	mcr 15, 0, r4, cr1, cr0, 0	@ Write SCTLR
	bl __reset_caches		@ Clean/invalidate; leaves caches off
	mrc 15, 0, r4, cr1, cr0, 0	@ Read SCTLR
	orr r4, r4, #(1<<22)		@ Enable unaligned mode (SCTLR.U)
	bic r4, r4, #2			@ Disable alignment faults (SCTLR.A)
	bic r4, r4, #1			@ Disable MMU while we build the tables
	mcr 15, 0, r4, cr1, cr0, 0	@ Write SCTLR
	mov r4, #0
	mcr 15, 0, r4, cr8, cr7, 0	@ Write TLBIALL - Invalidate unified
					@ TLB
	@ Setup MMU Primary table P=V mapping.
	mvn r4, #0
	mcr 15, 0, r4, cr3, cr0, 0	@ Write DACR - all domains Manager
	mov r4, #0			@ Always use TTBR0, no LPAE
	mcr 15, 0, r4, cr2, cr0, 2	@ Write TTBCR
	adr r4, page_table_addr		@ Load the page table base address
	ldr r4, [r4]
	mrc p15, 0, r0, c0, c0, 5	@ read MPIDR
	tst r0, #0x80000000		@ bit[31]: set => v7 MP extensions
	@ Set page table flags - there are two page table flag formats for the
	@ architecture.  For systems without multiprocessor extensions we use
	@ 0x1 which is Inner cacheable/Outer non-cacheable.  For systems with
	@ multiprocessor extensions we use 0x59 which is Inner/Outer
	@ write-back, no write-allocate, and cacheable.  See the ARMARM-v7AR
	@ for more details.
	it ne
	addne r4, r4, #0x58		@ MP form: 0x58 + 1 below = 0x59
	add r4, r4, #1			@ Non-MP form: 0x1
	mcr 15, 0, r4, cr2, cr0, 0	@ Write TTBR0
	mov r0, #34			@ 0x22: TR0 and TR1 - normal memory
	orr r0, r0, #(1 << 19)		@ Shareable
	mcr 15, 0, r0, cr10, cr2, 0	@ Write PRRR
	movw r0, #0x33
	movt r0, #0x33
	mcr 15, 0, r0, cr10, cr2, 1	@ Write NMRR
	mrc 15, 0, r0, cr1, cr0, 0	@ Read SCTLR
	bic r0, r0, #(1 << 28)		@ Clear TRE bit (no region remap)
	mcr 15, 0, r0, cr1, cr0, 0	@ Write SCTLR
	@ Now install the vector code - we move the Vector code from where it
	@ is in the image to be based at _rdimon_vector_base.  We have to do
	@ this copy as the code is all PC-relative.  We actually cheat and do
	@ a BX so that we are at a known address relatively quickly and have
	@ to move as little code as possible.
	mov r7, #(VectorCode_Limit - VectorCode)
	adr r5, VectorCode		@ src = vector code in the image
	adr r6, vector_base_addr	@ Load the base for vectors
	ldr r6, [r6]			@ dst = _rdimon_vector_base
copy_loop:
	@ Do the copy, one word at a time (r7 = bytes remaining).
	ldr r4, [r5], #4
	str r4, [r6], #4
	subs r7, r7, #4
	bne copy_loop
	mrc 15, 0, r4, cr1, cr0, 0	@ Read SCTLR
	bic r4, r4, #0x1000		@ Disable I Cache
	bic r4, r4, #4			@ Disable D Cache
	orr r4, r4, #1			@ Enable MMU
	bic r4, r4, #(1 << 28)		@ Clear TRE bit
	mcr 15, 0, r4, cr1, cr0, 0	@ Write SCTLR
	mrc 15, 0, r4, cr1, cr0, 2	@ Read CPACR
	orr r4, r4, #0x00f00000		@ Turn on VFP Co-procs (cp10/cp11)
	bic r4, r4, #0x80000000		@ Clear ASEDIS bit
	mcr 15, 0, r4, cr1, cr0, 2	@ Write CPACR
	isb
	mov r4, #0
	mcr 15, 0, r4, cr7, cr5, 4	@ Flush prefetch buffer
	@ Read back CPACR: if the cp10/cp11 enable bits did not stick, the
	@ CPU has no FP or Advanced SIMD and FPEXC must not be touched.
	mrc 15, 0, r4, cr1, cr0, 2	@ Read CPACR
	ubfx r4, r4, #20, #4		@ Extract bits [20, 23)
	cmp r4, #0xf			@ If not all set then the CPU does not
	itt eq				@ have FP or Advanced SIMD.
	moveq r4, #0x40000000		@ Enable FP and Advanced SIMD
	mcreq 10, 7, r4, cr8, cr0, 0	@ vmsr fpexc, r4
skip_vfp_enable:
	bl __enable_caches		@ Turn caches on
	bx r10				@ Return to CRT startup routine

@ Client hooks: return non-zero in r0 to request that the corresponding
@ cache be enabled.  This enables us to be more precise about which
@ caches we want.
init_cpu_client_enable_dcache:
init_cpu_client_enable_icache:
	mov r0, #1
	bx lr

@ Address of the destination for the vector code copy above.
vector_base_addr:
	.word _rdimon_vector_base
	.weak _rdimon_vector_base	@ Defaults to 0 unless user overrides

@ Address of the flat 1:1 first-level page table (see page_tables below).
page_table_addr:
	.word page_tables

@ Vector code - must be PIC and in ARM state.
@ Exception vector table.  This block, up to VectorCode_Limit, is copied
@ verbatim to _rdimon_vector_base by _rdimon_hw_init_hook, so everything
@ in it must be position-independent.
VectorCode:
	b vector_reset
	b vector_undef
	b vector_swi
	b vector_prefetch
	b vector_dataabt
	b vector_reserved
	b vector_irq
	b vector_fiq

@ Each stub below: point SP at the private vector stack, save all
@ general registers, load r4 with the vector number, then join the
@ common path.
vector_reset:
	adr sp, vector_sp_base
	push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
	mov r4, #0
	b vector_common
vector_undef:
	adr sp, vector_sp_base
	push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
	mov r4, #1
	b vector_common
vector_swi:
	adr sp, vector_sp_base
	push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
	mov r4, #2
	b vector_common
vector_prefetch:
	adr sp, vector_sp_base
	push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
	mov r4, #3
	b vector_common
vector_dataabt:
	adr sp, vector_sp_base
	push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
	mov r4, #4
	b vector_common
vector_reserved:
	adr sp, vector_sp_base
	push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
	mov r4, #5
	b vector_common
vector_irq:
	adr sp, vector_sp_base
	push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
	mov r4, #6
	b vector_common
vector_fiq:
	adr sp, vector_sp_base
	push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
	mov r4, #7
	b vector_common

vector_common:
	adr r1, vector_common_adr	@ Find where we're going to
	ldr r1, [r1]
	bx r1				@ And branch there (leaves PIC region)
vector_common_adr:
	.word vector_common_2		@ Common handling code

@ Vector stack
	.p2align 3			@ Align to 8 byte boundary to
					@ keep ABI compatibility
	.fill 32, 4, 0			@ 32-entry stack is enough for vector
					@ handlers.
vector_sp_base:
VectorCode_Limit:			@ End of PIC code for vectors

@ Common Handling of vectors: print the vector name and a dump of the
@ saved registers via semihosting, then exit.  Never returns.
@ In: r4 = vector number (0-7); saved r0-r14 on the vector stack.
	.type vector_common_2, %function
vector_common_2:
	mrs r1, APSR
	mrs r2, SPSR
	push {r1, r2}			@ Save PSRs
	@ Output the vector we have caught.
	bl out_nl
	adr r0, which_vector
	bl out_string
	adr r0, vector_names
	mov r1, #11			@ Each name is 11 bytes incl. NUL
	mla r0, r4, r1, r0		@ r0 = &vector_names[r4]
	bl out_string
	bl out_nl
	@ Dump the registers (APSR, SPSR, then the 14 pushed registers).
	adrl r6, register_names
	mov r7, #0			@ r7 = register index (0..15)
dump_r_loop:
	mov r0, r6
	bl out_string
	add r6, r6, #6			@ Each name is 6 bytes incl. NUL
	ldr r0, [sp, r7, lsl #2]	@ Fetch saved value from the stack
	bl out_word
	bl out_nl
	add r7, r7, #1
	cmp r7, #16
	blt dump_r_loop
	adr r0, end
	bl out_string
	@ And exit via semihosting SYS_EXIT (0x18) reporting the vector.
	mov r0, #24
	orr r1, r4, #0x20000
	svc 0x00123456

@ Output the NUL-terminated string in r0 (semihosting SYS_WRITE0).
out_string:
	push {lr}
	mov r1, r0
	mov r0, #4
	svc 0x00123456
	pop {pc}

@ Output a New-line.
out_nl:
	mov r0, #10
	@ Fallthrough
@ Output the character in r0 (semihosting SYS_WRITEC).
out_char:
	push {lr}
	strb r0, [sp, #-4]!		@ SYS_WRITEC takes a pointer to the char
	mov r0, #3
	mov r1, sp
	svc 0x00123456
	add sp, sp, #4
	pop {pc}

@ Output the value of r0 as a hex-word (8 nibbles, most significant first).
@ Clobbers: r0, r1, flags.
out_word:
	push {r4, r5, r6, lr}
	mov r4, r0			@ r4 = value to print
	mov r5, #28			@ r5 = current nibble shift
	adr r6, hexchars
word_loop:
	lsr r0, r4, r5
	and r0, r0, #15
	ldrb r0, [r6, r0]		@ Translate nibble to ASCII
	bl out_char
	subs r5, r5, #4
	bpl word_loop
	pop {r4, r5, r6, pc}

hexchars:
	.ascii "0123456789abcdef"
which_vector:
	.asciz "Hit vector:"
end:
	.asciz "End.\n"
@ Fixed-width table, 11 bytes per entry (10 chars + NUL); indexed by
@ vector number via `mla' above.
vector_names:
	.asciz "reset     "
	.asciz "undef     "
	.asciz "swi       "
	.asciz "prefetch  "
	.asciz "data abort"
	.asciz "reserved  "
	.asciz "irq       "
	.asciz "fiq       "
@ Fixed-width table, 6 bytes per entry (5 chars + NUL).
register_names:
	.asciz "apsr "
	.asciz "spsr "
	.asciz "r0   "
	.asciz "r1   "
	.asciz "r2   "
	.asciz "r3   "
	.asciz "r4   "
	.asciz "r5   "
	.asciz "r6   "
	.asciz "r7   "
	.asciz "r8   "
	.asciz "r9   "
	.asciz "r10  "
	.asciz "r11  "
	.asciz "r12  "
	.asciz "r14  "
	.p2align 3

@ Enable the caches requested by the init_cpu_client_enable_* hooks,
@ plus branch prediction.  Clobbers r0, r4, r5; returns via BX r5.
__enable_caches:
	mov r0, #0
	mcr 15, 0, r0, cr8, cr7, 0	@ Invalidate all unified-TLB
	mov r0, #0
	mcr 15, 0, r0, cr7, cr5, 6	@ Invalidate branch predictor
	mrc 15, 0, r4, cr1, cr0, 0	@ Read SCTLR
	orr r4, r4, #0x800		@ Enable branch predictor
	mcr 15, 0, r4, cr1, cr0, 0	@ Set SCTLR
	mov r5, lr			@ Save LR as we're going to BL
	mrc 15, 0, r4, cr1, cr0, 0	@ Read SCTLR
	bl init_cpu_client_enable_icache
	cmp r0, #0
	it ne
	orrne r4, r4, #0x1000		@ Enable I-Cache
	bl init_cpu_client_enable_dcache
	cmp r0, #0
	it ne
	orrne r4, r4, #4		@ Enable D-Cache
	mcr 15, 0, r4, cr1, cr0, 0	@ Write SCTLR
	bx r5				@ Return

@ Clean and invalidate all cache levels by set/way (ARMv7 CLIDR/CCSIDR
@ walk), leaving the D-cache disabled and the branch predictor
@ invalidated.  Clobbers r0-r7, ip, lr; returns via BX ip.
__reset_caches:
	mov ip, lr			@ Save LR
	mov r0, #0
	mcr 15, 0, r0, cr7, cr5, 6	@ Invalidate branch predictor
	mrc 15, 0, r6, cr1, cr0, 0	@ Read SCTLR; r6 kept for D-cache state
	mrc 15, 0, r0, cr1, cr0, 0	@ Read SCTLR!
	bic r0, r0, #0x1000		@ Disable I cache
	mcr 15, 0, r0, cr1, cr0, 0	@ Write SCTLR
	mrc 15, 1, r0, cr0, cr0, 1	@ Read CLIDR
	tst r0, #3			@ Harvard Cache?
	mov r0, #0
	it ne
	mcrne 15, 0, r0, cr7, cr5, 0	@ Invalidate Instruction Cache?
	mrc 15, 0, r1, cr1, cr0, 0	@ Read SCTLR (again!)
	orr r1, r1, #0x800		@ Enable branch predictor
	@ If we're not enabling caches we have
	@ no more work to do.
	bl init_cpu_client_enable_icache
	cmp r0, #0
	it ne
	orrne r1, r1, #0x1000		@ Enable I-Cache now -
					@ We actually only do this if we have a
					@ Harvard style cache.
	it eq
	bleq init_cpu_client_enable_dcache
	itt eq
	cmpeq r0, #0
	beq Finished1
	mcr 15, 0, r1, cr1, cr0, 0	@ Write SCTLR (turn on Branch
					@ predictor & I-cache)
	mrc 15, 1, r0, cr0, cr0, 1	@ Read CLIDR
	ands r3, r0, #0x7000000		@ CLIDR.LoC field
	lsr r3, r3, #23			@ Total cache levels << 1
	beq Finished1
	mov lr, #0			@ lr = cache level << 1
Loop11:
	mrc 15, 1, r0, cr0, cr0, 1	@ Read CLIDR
	add r2, lr, lr, lsr #1		@ r2 holds cache 'set' position
	lsr r1, r0, r2			@ Bottom 3-bits are Ctype for this level
	and r1, r1, #7			@ Get those 3-bits alone
	cmp r1, #2
	blt Skip1			@ No cache or only I-Cache at this level
	mcr 15, 2, lr, cr0, cr0, 0	@ Write CSSELR (select cache level)
	mov r1, #0
	isb sy				@ Ensure CSSELR is seen by CCSIDR read
	mrc 15, 1, r1, cr0, cr0, 0	@ Read CCSIDR
	and r2, r1, #7			@ Extract line length field
	add r2, r2, #4			@ Add 4 for the line length offset
					@ (log2 16 bytes)
	movw r0, #0x3ff
	ands r0, r0, r1, lsr #3		@ r0 is the max number on the way size
	clz r4, r0			@ r4 is the bit position of the way
					@ size increment
	movw r5, #0x7fff
	ands r5, r5, r1, lsr #13	@ r5 is the max number of the index
					@ size (right aligned)
Loop21:
	mov r7, r0			@ r7 working copy of max way size
Loop31:
	orr r1, lr, r7, lsl r4		@ factor in way number and cache number
	orr r1, r1, r5, lsl r2		@ factor in set number
	tst r6, #4			@ D-Cache on?
	ite eq
	mcreq 15, 0, r1, cr7, cr6, 2	@ No - invalidate by set/way
	mcrne 15, 0, r1, cr7, cr14, 2	@ yes - clean + invalidate by set/way
	subs r7, r7, #1			@ Decrement way number
	bge Loop31
	subs r5, r5, #1			@ Decrement set number
	bge Loop21
Skip1:
	add lr, lr, #2			@ increment cache number
	cmp r3, lr
	bgt Loop11
Finished1:
	@ Now we know the caches are clean we can:
	mrc 15, 0, r4, cr1, cr0, 0	@ Read SCTLR
	bic r4, r4, #4			@ Disable D-Cache
	mcr 15, 0, r4, cr1, cr0, 0	@ Write SCTLR
	mov r4, #0
	mcr 15, 0, r4, cr7, cr5, 6	@ Write BPIALL
	bx ip				@ Return

@ Set Z if this is a Cortex-A15 or Cortex-A7 (MIDR part numbers 0xC0F
@ and 0xC07, implementer 0x41, architecture 0xF).
@ Other flags corrupted.  Clobbers r8, r9.
is_a15_a7:
	mrc 15, 0, r8, c0, c0, 0	@ Read MIDR
	movw r9, #0xfff0		@ Mask: implementer/arch/part number
	movt r9, #0xff0f
	and r8, r8, r9
	movw r9, #0xc0f0		@ Cortex-A15
	movt r9, #0x410f
	cmp r8, r9
	movw r9, #0xc070		@ Cortex-A7
	movt r9, #0x410f
	it ne
	cmpne r8, r9
	bx lr

@ First-level page table: 4096 section entries mapping all 4GB 1:1.
@ Each entry is 0x1c0e + section base:
@ Descriptor type: Section
@ Bufferable: True
@ Cacheable: True
@ Execute Never: False
@ Domain: 0
@ Impl. Defined: 0
@ Access: 0/11 Full access
@ TEX: 001
@ Shareable: False
@ Not Global: False
@ Supersection: False
@ The PT* macros expand 4^7 = 16384... no: PT7 expands to 4096 words,
@ one per 1MB section.
#define PT(X) \
	.word X;
#define PT2(X) \
	PT(X) PT(X + 0x100000) PT(X + 0x200000) PT(X + 0x300000)
#define PT3(X) \
	PT2(X) PT2(X + 0x400000) PT2(X + 0x800000) PT2(X + 0xc00000)
#define PT4(X) \
	PT3(X) PT3(X + 0x1000000) PT3(X + 0x2000000) PT3(X + 0x3000000)
#define PT5(X) \
	PT4(X) PT4(X + 0x4000000) PT4(X + 0x8000000) PT4(X + 0xc000000)
#define PT6(X) \
	PT5(X) PT5(X + 0x10000000) PT5(X + 0x20000000) PT5(X + 0x30000000)
#define PT7(X) \
	PT6(X) PT6(X + 0x40000000) PT6(X + 0x80000000) PT6(X + 0xc0000000)

	.section page_tables_section, "aw", %progbits
	.p2align 14			@ 16KB alignment required for TTBR0
page_tables:
	PT7(0x1c0e)

#endif //#if defined(__ARM_ARCH_7A__) || __ARM_ARCH_PROFILE == 'A'