forked from nrfconnect/sdk-zephyr
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathx86_mmu.c
More file actions
2249 lines (1958 loc) · 59.1 KB
/
x86_mmu.c
File metadata and controls
2249 lines (1958 loc) · 59.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) 2011-2014 Wind River Systems, Inc.
* Copyright (c) 2017-2020 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <zephyr/kernel.h>
#include <zephyr/arch/x86/mmustructs.h>
#include <zephyr/kernel/mm.h>
#include <zephyr/sys/__assert.h>
#include <zephyr/sys/check.h>
#include <zephyr/logging/log.h>
#include <errno.h>
#include <ctype.h>
#include <zephyr/spinlock.h>
#include <kernel_arch_func.h>
#include <x86_mmu.h>
#include <zephyr/init.h>
#include <kernel_internal.h>
#include <mmu.h>
#include <zephyr/drivers/interrupt_controller/loapic.h>
#include <zephyr/arch/x86/memmap.h>
LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
/* We will use some ignored bits in the PTE to backup permission settings
* when the mapping was made. This is used to un-apply memory domain memory
* partitions to page tables when the partitions are removed.
*/
#define MMU_RW_ORIG MMU_IGNORED0
#define MMU_US_ORIG MMU_IGNORED1
#define MMU_XD_ORIG MMU_IGNORED2
/* Bits in the PTE that form the set of permission bits, when resetting */
#define MASK_PERM (MMU_RW | MMU_US | MMU_XD)
/* When we want to set up a new mapping, discarding any previous state */
#define MASK_ALL (~((pentry_t)0U))
/* Bits to set at mapping time for particular permissions. We set the actual
* page table bit effecting the policy and also the backup bit.
*/
#define ENTRY_RW (MMU_RW | MMU_RW_ORIG)
#define ENTRY_US (MMU_US | MMU_US_ORIG)
#define ENTRY_XD (MMU_XD | MMU_XD_ORIG)
/* Bit position which is always zero in a PTE. We'll use the PAT bit.
* This helps disambiguate PTEs that do not have the Present bit set (MMU_P):
* - If the entire entry is zero, it's an un-mapped virtual page
* - If PTE_ZERO is set, we flipped this page due to KPTI
* - Otherwise, this was a page-out
*/
#define PTE_ZERO MMU_PAT
/* Protects x86_domain_list and serializes instantiation of intermediate
* paging structures.
*/
__pinned_bss
static struct k_spinlock x86_mmu_lock;
#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
/* List of all active and initialized memory domains. This is used to make
* sure all memory mappings are the same across all page tables when invoking
* range_map()
*/
__pinned_bss
static sys_slist_t x86_domain_list;
#endif
/*
* Definitions for building an ontology of paging levels and capabilities
* at each level
*/
/* Data structure describing the characteristics of a particular paging
 * level. One instance exists per level of the selected paging mode; see
 * paging_levels[], which lists them from the top-level table down to the
 * page table.
 */
struct paging_level {
/* What bits are used to store physical address. get_entry_phys() ANDs an
 * entry with this mask to extract the address it holds.
 */
pentry_t mask;
/* Number of entries in this paging structure */
size_t entries;
/* How many bits to right-shift a virtual address to obtain the
 * appropriate entry within this table.
 *
 * The memory scope of each entry in this table is 1 << shift.
 */
unsigned int shift;
#ifdef CONFIG_EXCEPTION_DEBUG
/* Name of this level, for debug purposes. Only present when exception
 * debugging is enabled, so it must only be referenced from code guarded
 * by the same option.
 */
const char *name;
#endif
};
/* Flags for all entries in intermediate paging levels.
* Fortunately, the same bits are set for all intermediate levels for all
* three paging modes.
*
* Obviously P is set.
*
* We want RW and US bit always set; actual access control will be
* done at the leaf level.
*
* XD (if supported) always 0. Disabling execution done at leaf level.
*
* PCD/PWT always 0. Caching properties again done at leaf level.
*/
#define INT_FLAGS (MMU_P | MMU_RW | MMU_US)
/* Paging level ontology for the selected paging mode.
 *
 * Entries are ordered from the top-level structure (index 0) down to the
 * page table (last index, PTE_LEVEL). Which entries exist depends on the
 * paging mode: PML4 only for x86-64, PDPT for x86-64 and PAE, while the
 * page directory and page table are always present.
 *
 * See Figures 4-4, 4-7, 4-11 in the Intel SDM, vol 3A
 */
__pinned_rodata
static const struct paging_level paging_levels[] = {
#ifdef CONFIG_X86_64
/* Page Map Level 4 */
{
.mask = 0x7FFFFFFFFFFFF000ULL,
.entries = 512U,
.shift = 39U,
#ifdef CONFIG_EXCEPTION_DEBUG
.name = "PML4"
#endif
},
#endif /* CONFIG_X86_64 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
/* Page Directory Pointer Table */
{
.mask = 0x7FFFFFFFFFFFF000ULL,
#ifdef CONFIG_X86_64
.entries = 512U,
#else
/* PAE version */
.entries = 4U,
#endif
.shift = 30U,
#ifdef CONFIG_EXCEPTION_DEBUG
.name = "PDPT"
#endif
},
#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
/* Page Directory */
{
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
.mask = 0x7FFFFFFFFFFFF000ULL,
.entries = 512U,
.shift = 21U,
#else
/* 32-bit */
.mask = 0xFFFFF000U,
.entries = 1024U,
.shift = 22U,
#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
#ifdef CONFIG_EXCEPTION_DEBUG
.name = "PD"
#endif
},
/* Page Table */
{
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
/* NOTE(review): this mask is narrower than the ones used by the levels
 * above (bits 59-62 are excluded here). Presumably intentional for leaf
 * entries -- confirm against the SDM PTE layout before changing.
 */
.mask = 0x07FFFFFFFFFFF000ULL,
.entries = 512U,
.shift = 12U,
#else
/* 32-bit */
.mask = 0xFFFFF000U,
.entries = 1024U,
.shift = 12U,
#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
#ifdef CONFIG_EXCEPTION_DEBUG
.name = "PT"
#endif
}
};
#define NUM_LEVELS ARRAY_SIZE(paging_levels)
#define PTE_LEVEL (NUM_LEVELS - 1)
#define PDE_LEVEL (NUM_LEVELS - 2)
/*
* Macros for reserving space for page tables
*
* We need to reserve a block of memory equal in size to the page tables
* generated by gen_mmu.py so that memory addresses do not shift between
* build phases. These macros ultimately specify INITIAL_PAGETABLE_SIZE.
*/
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#ifdef CONFIG_X86_64
#define NUM_PML4_ENTRIES 512U
#define NUM_PDPT_ENTRIES 512U
#else
#define NUM_PDPT_ENTRIES 4U
#endif /* CONFIG_X86_64 */
#define NUM_PD_ENTRIES 512U
#define NUM_PT_ENTRIES 512U
#else
#define NUM_PD_ENTRIES 1024U
#define NUM_PT_ENTRIES 1024U
#endif /* !CONFIG_X86_64 && !CONFIG_X86_PAE */
/* Memory range covered by an instance of various table types */
#define PT_AREA ((uintptr_t)(CONFIG_MMU_PAGE_SIZE * NUM_PT_ENTRIES))
#define PD_AREA (PT_AREA * NUM_PD_ENTRIES)
#ifdef CONFIG_X86_64
#define PDPT_AREA (PD_AREA * NUM_PDPT_ENTRIES)
#endif
#define VM_ADDR CONFIG_KERNEL_VM_BASE
#define VM_SIZE CONFIG_KERNEL_VM_SIZE
/* Define a range [PT_START, PT_END) which is the memory range
* covered by all the page tables needed for the address space
*/
#define PT_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PT_AREA))
#define PT_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PT_AREA))
/* Number of page tables needed to cover address space. Depends on the specific
 * bounds, but roughly 1 page table per PT_AREA bytes of address space (with
 * 4KB pages: 2MB for 512-entry tables, 4MB for 1024-entry 32-bit tables)
 */
#define NUM_PT ((PT_END - PT_START) / PT_AREA)
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
/* Same semantics as above, but for the page directories needed to cover
* system RAM.
*/
#define PD_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PD_AREA))
#define PD_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PD_AREA))
/* Number of page directories needed to cover the address space. Depends on the
* specific bounds, but roughly 1 page directory per 1GB of RAM
*/
#define NUM_PD ((PD_END - PD_START) / PD_AREA)
#else
/* 32-bit page tables just have one toplevel page directory */
#define NUM_PD 1
#endif
#ifdef CONFIG_X86_64
/* Same semantics as above, but for the page directory pointer tables needed
* to cover the address space. On 32-bit there is just one 4-entry PDPT.
*/
#define PDPT_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PDPT_AREA))
#define PDPT_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PDPT_AREA))
/* Number of PDPTs needed to cover the address space. 1 PDPT per 512GB of VM */
#define NUM_PDPT ((PDPT_END - PDPT_START) / PDPT_AREA)
/* All pages needed for page tables, using computed values plus one more for
* the top-level PML4
*/
#define NUM_TABLE_PAGES (NUM_PT + NUM_PD + NUM_PDPT + 1)
#else /* !CONFIG_X86_64 */
/* Number of pages we need to reserve in the stack for per-thread page tables */
#define NUM_TABLE_PAGES (NUM_PT + NUM_PD)
#endif /* CONFIG_X86_64 */
#define INITIAL_PTABLE_PAGES \
(NUM_TABLE_PAGES + CONFIG_X86_EXTRA_PAGE_TABLE_PAGES)
#ifdef CONFIG_X86_PAE
/* Toplevel PDPT wasn't included as it is not a page in size */
#define INITIAL_PTABLE_SIZE \
((INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE) + 0x20)
#else
#define INITIAL_PTABLE_SIZE \
(INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
#endif
/* "dummy" pagetables for the first-phase build. The real page tables
 * are produced by gen_mmu.py based on data read in zephyr-prebuilt.elf,
 * and this dummy array is discarded.
 */
Z_GENERIC_SECTION(.dummy_pagetables)
static __used char dummy_pagetables[INITIAL_PTABLE_SIZE];
/*
* Utility functions
*/
/* For a table at a particular level, get the entry index that corresponds to
* the provided virtual address
*/
__pinned_func
static inline int get_index(void *virt, int level)
{
return (((uintptr_t)virt >> paging_levels[level].shift) %
paging_levels[level].entries);
}
/* Return a pointer to the entry, inside the table 'ptables' of the given
 * paging level, that corresponds to virtual address 'virt'. The slot is
 * chosen by get_index(); the entry itself is not read.
 */
__pinned_func
static inline pentry_t *get_entry_ptr(pentry_t *ptables, void *virt, int level)
{
return &ptables[get_index(virt, level)];
}
/* Return the raw value of the entry, inside the table 'ptables' of the given
 * paging level, that corresponds to virtual address 'virt'. This is a plain
 * (non-atomic) read of the slot selected by get_index().
 */
__pinned_func
static inline pentry_t get_entry(pentry_t *ptables, void *virt, int level)
{
return ptables[get_index(virt, level)];
}
/* Extract the physical address stored in a table entry.
 *
 * Only the bits selected by the level's address mask are kept; every flag
 * bit outside of that mask is dropped from the result.
 */
__pinned_func
static inline uintptr_t get_entry_phys(pentry_t entry, int level)
{
	const pentry_t addr_bits = paging_levels[level].mask;

	return entry & addr_bits;
}
/* Follow an entry to the table it links to.
 *
 * The physical address held in the entry is extracted and then translated
 * with k_mem_virt_addr() so the caller gets a pointer it can dereference.
 */
__pinned_func
static inline pentry_t *next_table(pentry_t entry, int level)
{
	uintptr_t child_phys = get_entry_phys(entry, level);

	return k_mem_virt_addr(child_phys);
}
/* Number of table entries at this level, as recorded in the 'entries' field
 * of paging_levels[level].
 */
__pinned_func
static inline size_t get_num_entries(int level)
{
return paging_levels[level].entries;
}
/* Size in bytes of one table at the given level: the number of entries
 * multiplied by the size of an entry. This is 4K for everything except
 * PAE PDPTs.
 */
__pinned_func
static inline size_t table_size(int level)
{
	const size_t slots = get_num_entries(level);

	return slots * sizeof(pentry_t);
}
/* Amount of virtual memory, in bytes, that a single entry of a table at the
 * given level covers. This is 1 shifted left by the level's shift amount.
 */
__pinned_func
static inline size_t get_entry_scope(int level)
{
	const unsigned int bits = paging_levels[level].shift;

	return 1UL << bits;
}
/* Amount of virtual memory, in bytes, that one whole table at the given
 * level covers: the per-entry scope multiplied by the number of entries.
 */
__pinned_func
static inline size_t get_table_scope(int level)
{
	const size_t per_entry = get_entry_scope(level);
	const size_t slots = get_num_entries(level);

	return per_entry * slots;
}
/* Tell whether an entry maps memory directly instead of linking to a
 * lower-level table.
 *
 * The caller must have checked the Present bit first! Non-present entries
 * may carry OS data in any other bit, including the one tested here.
 */
__pinned_func
static inline bool is_leaf(int level, pentry_t entry)
{
	/* Entries at the page table level are always leaves; above that, an
	 * entry is a leaf only when its PS bit is set.
	 */
	return (level == PTE_LEVEL) || ((entry & MMU_PS) != 0U);
}
/* Walk the page tables for a virtual address and report the entry at which
 * the walk stops: the first entry that is either not present or a leaf.
 *
 * The value stored in *val is the raw entry. By design this does NOT un-flip
 * PTEs that were flipped for KPTI.
 *
 * paging_level may be NULL when the caller does not care at which level the
 * walk ended.
 */
__pinned_func
static inline void pentry_get(int *paging_level, pentry_t *val,
			      pentry_t *ptables, void *virt)
{
	pentry_t *cur = ptables;
	int level = 0;

	while (level < NUM_LEVELS) {
		pentry_t entry = get_entry(cur, virt, level);
		bool present = (entry & MMU_P) != 0;

		if (present && !is_leaf(level, entry)) {
			/* Intermediate entry: descend into the linked table */
			cur = next_table(entry, level);
			level++;
			continue;
		}

		/* End of the walk, report what was found */
		*val = entry;
		if (paging_level != NULL) {
			*paging_level = level;
		}
		return;
	}
}
/* Invalidate the TLB entries for the page that contains 'addr'. */
__pinned_func
static inline void tlb_flush_page(void *addr)
{
	/* invlpg is given a memory operand; the byte at 'addr' stands in for
	 * the page to invalidate.
	 */
	__asm__ ("invlpg %0" :: "m" (*(char *)addr));
}
#ifdef CONFIG_X86_KPTI
/* Tell whether a PTE had all of its bits inverted for KPTI.
 *
 * Such an entry has the Present bit clear and the PTE_ZERO bit set; that
 * bit is always zero in an entry that has not been flipped.
 */
__pinned_func
static inline bool is_flipped_pte(pentry_t pte)
{
	if ((pte & MMU_P) != 0) {
		/* Present entries are never flipped */
		return false;
	}

	return (pte & PTE_ZERO) != 0;
}
#endif
#if defined(CONFIG_SMP)
/* Reload CR3 on the executing CPU with the physical address of the page
 * tables it is expected to be using. Presumably this is the handler for the
 * IPI sent by tlb_shootdown(), and the CR3 write is what flushes this CPU's
 * TLB -- confirm against the vector registration, which is not in this file
 * section.
 *
 * @param arg Unused.
 */
__pinned_func
void z_x86_tlb_ipi(const void *arg)
{
uintptr_t ptables_phys;
ARG_UNUSED(arg);
#ifdef CONFIG_X86_KPTI
/* We're always on the kernel's set of page tables in this context
 * if KPTI is turned on
 */
ptables_phys = z_x86_cr3_get();
__ASSERT(ptables_phys == k_mem_phys_addr(&z_x86_kernel_ptables), "");
#else
/* We might have been moved to another memory domain, so always invoke
 * z_x86_thread_page_tables_get() instead of using current CR3 value.
 */
ptables_phys = k_mem_phys_addr(z_x86_thread_page_tables_get(_current));
#endif
/*
 * In the future, we can consider making this smarter, such as
 * propagating which page tables were modified (in case they are
 * not active on this CPU) or an address range to call
 * tlb_flush_page() on.
 */
LOG_DBG("%s on CPU %d\n", __func__, arch_curr_cpu()->id);
z_x86_cr3_set(ptables_phys);
}
/* Request a TLB flush on other CPUs by sending the CONFIG_TLB_IPI_VECTOR
 * inter-processor interrupt through the local APIC. LOAPIC_ICR_IPI_OTHERS
 * presumably addresses every CPU except the sender -- confirm against the
 * loapic driver.
 *
 * NOTE: This is not synchronous and the actual flush takes place some short
 * time after this exits.
 */
__pinned_func
static inline void tlb_shootdown(void)
{
z_loapic_ipi(0, LOAPIC_ICR_IPI_OTHERS, CONFIG_TLB_IPI_VECTOR);
}
#endif /* CONFIG_SMP */
/* Assert that 'addr' is a multiple of CONFIG_MMU_PAGE_SIZE.
 *
 * When assertions are compiled out (__ASSERT_ON is 0) this does nothing and
 * the argument is only marked as unused.
 */
__pinned_func
static inline void assert_addr_aligned(uintptr_t addr)
{
#if __ASSERT_ON
__ASSERT((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
"unaligned address 0x%" PRIxPTR, addr);
#else
ARG_UNUSED(addr);
#endif
}
/* Tell whether 'addr' is a multiple of CONFIG_MMU_PAGE_SIZE.
 *
 * Non-asserting counterpart of assert_addr_aligned(), for callers that want
 * to handle a misaligned address themselves.
 */
__pinned_func
static inline bool is_addr_aligned(uintptr_t addr)
{
	return (addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U;
}
/* Pointer-typed convenience wrapper around assert_addr_aligned(): asserts
 * that the virtual address 'addr' is page-aligned.
 */
__pinned_func
static inline void assert_virt_addr_aligned(void *addr)
{
assert_addr_aligned((uintptr_t)addr);
}
/* Pointer-typed convenience wrapper around is_addr_aligned(): returns true
 * when the virtual address 'addr' is page-aligned.
 */
__pinned_func
static inline bool is_virt_addr_aligned(void *addr)
{
return is_addr_aligned((uintptr_t)addr);
}
/* Assert that 'size' is a multiple of CONFIG_MMU_PAGE_SIZE.
 *
 * When assertions are compiled out (__ASSERT_ON is 0) this does nothing and
 * the argument is only marked as unused.
 */
__pinned_func
static inline void assert_size_aligned(size_t size)
{
#if __ASSERT_ON
__ASSERT((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
"unaligned size %zu", size);
#else
ARG_UNUSED(size);
#endif
}
/* Tell whether 'size' is a multiple of CONFIG_MMU_PAGE_SIZE.
 *
 * Non-asserting counterpart of assert_size_aligned().
 */
__pinned_func
static inline bool is_size_aligned(size_t size)
{
	return (size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U;
}
/* Assert that a memory region is page-aligned: first its starting virtual
 * address, then its size.
 */
__pinned_func
static inline void assert_region_page_aligned(void *addr, size_t size)
{
assert_virt_addr_aligned(addr);
assert_size_aligned(size);
}
/* Tell whether a memory region is page-aligned, i.e. both its starting
 * virtual address and its size are multiples of the page size. The size is
 * only examined when the address is aligned.
 */
__pinned_func
static inline bool is_region_page_aligned(void *addr, size_t size)
{
	return is_virt_addr_aligned(addr) && is_size_aligned(size);
}
/*
* Debug functions. All conditionally compiled with CONFIG_EXCEPTION_DEBUG.
*/
#ifdef CONFIG_EXCEPTION_DEBUG
/* Add colors to page table dumps to indicate mapping type */
#define COLOR_PAGE_TABLES 1
#if COLOR_PAGE_TABLES
#define ANSI_DEFAULT "\x1B" "[0m"
#define ANSI_RED "\x1B" "[1;31m"
#define ANSI_GREEN "\x1B" "[1;32m"
#define ANSI_YELLOW "\x1B" "[1;33m"
#define ANSI_BLUE "\x1B" "[1;34m"
#define ANSI_MAGENTA "\x1B" "[1;35m"
#define ANSI_CYAN "\x1B" "[1;36m"
#define ANSI_GREY "\x1B" "[1;90m"
#define COLOR(x) printk(_CONCAT(ANSI_, x))
#else
#define COLOR(x) do { } while (false)
#endif
/* Map a table entry to the single character used for it in page table dumps.
 *
 *   '.'  entry is entirely zero (unmapped)
 *   'w'  writable, execute-disabled (RW)
 *   'a'  writable, executable (RWX)
 *   'r'  read-only, execute-disabled (R)
 *   'x'  read-only, executable (RX)
 *
 * The letter is upper-cased when the entry has the US bit set.
 */
__pinned_func
static char get_entry_code(pentry_t value)
{
	char code;

	if (value == 0U) {
		/* Unmapped entry */
		return '.';
	}

	bool writable = (value & MMU_RW) != 0U;
	bool no_exec = (value & MMU_XD) != 0U;

	if (writable) {
		code = no_exec ? 'w' : 'a';
	} else {
		code = no_exec ? 'r' : 'x';
	}

	if ((value & MMU_US) != 0U) {
		/* Uppercase indicates user mode access */
		code = toupper((unsigned char)code);
	}

	return code;
}
/* Print one character per entry of a paging structure, 64 entries per row.
 *
 * The character comes from get_entry_code(). When COLOR_PAGE_TABLES is
 * enabled, each one is preceded by an ANSI color sequence describing the
 * kind of entry:
 *   yellow   present leaf, identity mapped (phys == virt)
 *   green    present leaf, phys + K_MEM_VIRT_OFFSET == virt
 *   cyan     any other present leaf, or a KPTI-flipped identity mapping
 *   magenta  present intermediate entry
 *   grey     unmapped entry
 *   blue     KPTI-flipped entry that is not identity mapped
 *   red      any other non-present leaf (paged out)
 *
 * @param entries_array Table to print
 * @param base Virtual address covered by the first entry of the table
 * @param level Paging level of the table
 * @param count Number of entries to print
 */
__pinned_func
static void print_entries(pentry_t entries_array[], uint8_t *base, int level,
			  size_t count)
{
	int column = 0;

	/* The index has the same type as 'count' so the loop condition does
	 * not mix signed and unsigned operands.
	 */
	for (size_t i = 0; i < count; i++) {
		pentry_t entry = entries_array[i];
		uintptr_t phys = get_entry_phys(entry, level);
		uintptr_t virt =
			(uintptr_t)base + (get_entry_scope(level) * i);

		if ((entry & MMU_P) != 0U) {
			if (is_leaf(level, entry)) {
				if (phys == virt) {
					/* Identity mappings */
					COLOR(YELLOW);
				} else if (phys + K_MEM_VIRT_OFFSET == virt) {
					/* Permanent RAM mappings */
					COLOR(GREEN);
				} else {
					/* General mapped pages */
					COLOR(CYAN);
				}
			} else {
				/* Intermediate entry */
				COLOR(MAGENTA);
			}
		} else {
			if (is_leaf(level, entry)) {
				if (entry == 0U) {
					/* Unmapped */
					COLOR(GREY);
#ifdef CONFIG_X86_KPTI
				} else if (is_flipped_pte(entry)) {
					/* KPTI, un-flip it so that the address
					 * and the permission letter printed
					 * below describe the real mapping. The
					 * color is chosen exactly once, after
					 * the address has been recovered.
					 */
					entry = ~entry;
					phys = get_entry_phys(entry, level);
					if (phys == virt) {
						/* Identity mapped */
						COLOR(CYAN);
					} else {
						/* Non-identity mapped */
						COLOR(BLUE);
					}
#endif
				} else {
					/* Paged out */
					COLOR(RED);
				}
			} else {
				/* Un-mapped intermediate entry */
				COLOR(GREY);
			}
		}

		printk("%c", get_entry_code(entry));

		column++;
		if (column == 64) {
			column = 0;
			printk("\n");
		}
	}
	COLOR(DEFAULT);

	/* Terminate a partially filled last row */
	if (column != 0) {
		printk("\n");
	}
}
/* Recursively print a paging structure and every table linked from it.
 *
 * A header line with the table's level name, virtual and physical address
 * and the virtual range it covers is printed first, followed by the entry
 * map produced by print_entries(). Unless this is a page table, each present
 * entry that is not a large page is then followed into its child table.
 *
 * @param table Table to dump
 * @param base Virtual address covered by the first entry of the table
 * @param level Paging level of the table
 */
__pinned_func
static void dump_ptables(pentry_t *table, uint8_t *base, int level)
{
	const struct paging_level *info = &paging_levels[level];

#ifdef CONFIG_X86_64
	/* Account for the virtual memory "hole" with sign-extension */
	if (((uintptr_t)base & BITL(47)) != 0) {
		base = (uint8_t *)((uintptr_t)base | (0xFFFFULL << 48));
	}
#endif

	printk("%s at %p (0x%" PRIxPTR "): ", info->name, table,
	       k_mem_phys_addr(table));
	if (level != 0) {
		printk("for %p - %p\n", base,
		       base + get_table_scope(level) - 1);
	} else {
		printk("entire address space\n");
	}

	print_entries(table, base, level, info->entries);

	if (level != PTE_LEVEL) {
		/* Not a page table: dump all linked child tables */
		for (int j = 0; j < info->entries; j++) {
			pentry_t entry = table[j];
			bool present = (entry & MMU_P) != 0U;
			bool big_page = (entry & MMU_PS) != 0U;

			if (present && !big_page) {
				pentry_t *child = next_table(entry, level);

				dump_ptables(child,
					     base + (j * get_entry_scope(level)),
					     level + 1);
			}
		}
	}
}
/* Dump a complete set of page tables with printk, starting at the top-level
 * table 'ptables' (level 0) and virtual address 0.
 */
__pinned_func
void z_x86_dump_page_tables(pentry_t *ptables)
{
dump_ptables(ptables, NULL, 0);
}
/* Enable to dump out the kernel's page table right before main() starts,
* sometimes useful for deep debugging. May overwhelm twister.
*/
#define DUMP_PAGE_TABLES 0
#if DUMP_PAGE_TABLES
/* Init hook that dumps the kernel's page tables. Only compiled when
 * DUMP_PAGE_TABLES is set to a non-zero value; always returns 0.
 */
__pinned_func
static int dump_kernel_tables(void)
{
z_x86_dump_page_tables(z_x86_kernel_ptables);
return 0;
}
SYS_INIT(dump_kernel_tables, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT);
#endif
/* Append 'str' at the write cursor of a bounded buffer.
 *
 * *buf is the current write position and *size the space left there. On
 * success both are advanced past the appended text. When snprintk() reports
 * a length that does not fit in the remaining space, the cursor is left
 * where it is and *size is set to 0, so later appends add nothing.
 */
__pinned_func
static void str_append(char **buf, size_t *size, const char *str)
{
	int ret = snprintk(*buf, *size, "%s", str);
	/* Compared as an unsigned quantity, exactly like the mixed int/size_t
	 * comparison it replaces.
	 */
	size_t produced = (size_t)ret;

	if (produced >= *size) {
		/* Truncated */
		*size = 0U;
		return;
	}

	*buf += ret;
	*size -= ret;
}
/* Log, at error level, one table entry: the level name, the start of the
 * virtual range the entry covers, the address bits of the entry and the
 * names of the flag bits that are set in it.
 *
 * @param level Paging level the entry belongs to
 * @param virt Virtual address that was looked up
 * @param entry Raw entry value
 */
__pinned_func
static void dump_entry(int level, void *virt, pentry_t entry)
{
const struct paging_level *info = &paging_levels[level];
/* Sized for the worst case: all eight flag names below, each followed by
 * a space, add up to 23 characters plus the terminating NUL. Adding a
 * flag to the list requires growing this buffer.
 */
char buf[24] = { 0 };
char *pos = buf;
size_t sz = sizeof(buf);
/* Round down to the start of the range covered by an entry at this level */
uint8_t *virtmap = (uint8_t *)ROUND_DOWN(virt, get_entry_scope(level));
/* Append "<bit> " to buf when MMU_<bit> is set in the entry */
#define DUMP_BIT(bit) do { \
if ((entry & MMU_##bit) != 0U) { \
str_append(&pos, &sz, #bit " "); \
} \
} while (false)
DUMP_BIT(RW);
DUMP_BIT(US);
DUMP_BIT(PWT);
DUMP_BIT(PCD);
DUMP_BIT(A);
DUMP_BIT(D);
DUMP_BIT(G);
DUMP_BIT(XD);
LOG_ERR("%sE: %p -> " PRI_ENTRY ": %s", info->name,
virtmap, entry & info->mask, buf);
#undef DUMP_BIT
}
/* Externally visible wrapper around pentry_get(): walks 'ptables' for 'virt'
 * and stores the raw entry where the walk stopped in *val and, if
 * paging_level is not NULL, the level it stopped at in *paging_level.
 */
__pinned_func
void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables,
void *virt)
{
pentry_get(paging_level, val, ptables, virt);
}
/*
 * Debug helper that logs the MMU table information for one virtual address,
 * such as when we get an unexpected page fault.
 *
 * The page tables are walked for 'virt'. If the walk ends at a present
 * entry, its flags are logged with dump_entry(); otherwise only the level at
 * which the walk hit a non-present entry is reported.
 */
__pinned_func
void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt)
{
	pentry_t entry = 0;
	int level = 0;

	pentry_get(&level, &entry, ptables, virt);

	if ((entry & MMU_P) != 0) {
		dump_entry(level, virt, entry);
		return;
	}

	LOG_ERR("%sE: not present", paging_levels[level].name);
}
#endif /* CONFIG_EXCEPTION_DEBUG */
/* Rebuild the permission bits of a PTE from the backup copies that were
 * stashed in it when the mapping was made. All other bits of the entry are
 * returned unchanged.
 */
__pinned_func
static inline pentry_t reset_pte(pentry_t old_val)
{
	/* Start from the old entry with its current permission bits wiped */
	pentry_t restored = old_val & (~K_MEM_PARTITION_PERM_MASK);

	/* Re-apply each permission whose backup bit is set */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	if ((old_val & MMU_XD_ORIG) != 0) {
		restored |= MMU_XD;
	}
#endif
	if ((old_val & MMU_US_ORIG) != 0) {
		restored |= MMU_US;
	}
	if ((old_val & MMU_RW_ORIG) != 0) {
		restored |= MMU_RW;
	}

	return restored;
}
/* Final adjustment of a PTE value for Kernel page table isolation.
 *
 * With CONFIG_X86_KPTI, an entry destined for user mode page tables has all
 * of its bits flipped when it is present, does not have the US bit set and
 * does not map the shared kernel page. This serves three purposes:
 * - The page isn't present, implementing page table isolation
 * - Flipping the physical address bits cheaply mitigates L1TF
 * - State is preserved; to get original PTE, just complement again
 *
 * Without CONFIG_X86_KPTI the value is returned untouched.
 */
__pinned_func
static inline pentry_t pte_finalize_value(pentry_t val, bool user_table,
					  int level)
{
#ifdef CONFIG_X86_KPTI
	static const uintptr_t shared_phys_addr =
		K_MEM_PHYS_ADDR(POINTER_TO_UINT(&z_shared_kernel_page_start));

	if (!user_table) {
		/* Kernel page tables keep every mapping as-is */
		return val;
	}
	if ((val & MMU_US) != 0 || (val & MMU_P) == 0) {
		/* User-accessible or non-present entries are not flipped */
		return val;
	}
	if (get_entry_phys(val, level) == shared_phys_addr) {
		/* The shared page stays mapped in user page tables */
		return val;
	}

	return ~val;
#else
	ARG_UNUSED(user_table);
	ARG_UNUSED(level);

	return val;
#endif
}
/* Atomic functions for modifying PTEs. These don't map nicely to Zephyr's
* atomic API since the only types supported are 'int' and 'void *' and
* the size of pentry_t depends on other factors like PAE.
*/
#ifndef CONFIG_X86_PAE
/* Non-PAE, pentry_t is same size as void ptr so use atomic_ptr_* APIs */
/* Atomically read the page table entry at 'target' by viewing it as an
 * atomic pointer and converting the value read back to pentry_t.
 */
__pinned_func
static inline pentry_t atomic_pte_get(const pentry_t *target)
{
return (pentry_t)atomic_ptr_get((const atomic_ptr_t *)target);
}
/* Atomic compare-and-swap on a page table entry, implemented with
 * atomic_ptr_cas() on the entry viewed as an atomic pointer.
 *
 * Returns the result of atomic_ptr_cas(); pte_atomic_update() retries while
 * that result is false.
 */
__pinned_func
static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
pentry_t new_value)
{
return atomic_ptr_cas((atomic_ptr_t *)target, (void *)old_value,
(void *)new_value);
}
#else
/* Atomic builtins for 64-bit values on 32-bit x86 require floating point.
* Don't do this, just lock local interrupts. Needless to say, this
* isn't workable if someone ever adds SMP to the 32-bit x86 port.
*/
BUILD_ASSERT(!IS_ENABLED(CONFIG_SMP));
/* PAE variant: read the page table entry at 'target' with a plain load.
 *
 * NOTE(review): interrupts are not locked here, unlike in the PAE
 * atomic_pte_cas(). Presumably acceptable because pte_atomic_update()
 * re-validates the value with the compare-and-swap and retries -- confirm
 * for any other caller.
 */
__pinned_func
static inline pentry_t atomic_pte_get(const pentry_t *target)
{
return *target;
}
/* PAE variant of compare-and-swap on a page table entry.
 *
 * The comparison and the conditional store are performed with local
 * interrupts locked instead of using an atomic instruction.
 *
 * @param target Entry to update
 * @param old_value Value the entry is expected to hold
 * @param new_value Value to store if the entry holds old_value
 * @retval true if the entry held old_value and was set to new_value
 * @retval false if the entry held a different value and was left untouched
 */
__pinned_func
static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
				  pentry_t new_value)
{
	/* arch_irq_lock() returns an unsigned key; keep it in a variable of
	 * that type so that it is handed back to arch_irq_unlock() without
	 * going through a signed conversion.
	 */
	unsigned int key = arch_irq_lock();
	bool swapped = (*target == old_value);

	if (swapped) {
		*target = new_value;
	}

	arch_irq_unlock(key);

	return swapped;
}
#endif /* CONFIG_X86_PAE */
/* Indicates that the target page tables will be used by user mode threads.
* This only has implications for CONFIG_X86_KPTI where user thread facing
* page tables need nearly all pages that don't have the US bit to also
* not be Present.
*/
#define OPTION_USER BIT(0)
/* Indicates that the operation requires TLBs to be flushed as we are altering
* existing mappings. Not needed for establishing new mappings
*/
#define OPTION_FLUSH BIT(1)
/* Indicates that each PTE's permission bits should be restored to their
* original state when the memory was mapped. All other bits in the PTE are
* preserved.
*/
#define OPTION_RESET BIT(2)
/* Indicates that the mapping will need to be cleared entirely. This is
* mainly used for unmapping the memory region.
*/
#define OPTION_CLEAR BIT(3)
/**
 * Atomically update bits in a page table entry
 *
 * The entry is read, the new value is computed from it and written back
 * with a compare-and-swap; the whole sequence is retried until the swap
 * succeeds. This makes the update atomic with respect to modifications by
 * other CPUs or preempted contexts, which can be very important when making
 * decisions based on the PTE's prior "dirty" state.
 *
 * With CONFIG_X86_KPTI, an entry that was flipped is un-flipped before the
 * update is applied, and the returned old value is un-flipped as well.
 *
 * @param pte Pointer to page table entry to update
 * @param update_val Updated bits to set/clear in PTE. Ignored with
 *                   OPTION_RESET or OPTION_CLEAR.
 * @param update_mask Which bits to modify in the PTE. Ignored with
 *                    OPTION_RESET or OPTION_CLEAR.
 * @param options Control flags
 * @retval Old PTE value
 */
__pinned_func
static inline pentry_t pte_atomic_update(pentry_t *pte, pentry_t update_val,
					 pentry_t update_mask,
					 uint32_t options)
{
	const bool for_user = (options & OPTION_USER) != 0U;
	const bool do_reset = (options & OPTION_RESET) != 0U;
	const bool do_clear = (options & OPTION_CLEAR) != 0U;
	pentry_t prev;

	for (;;) {
		pentry_t next;

		prev = atomic_pte_get(pte);
		next = prev;

#ifdef CONFIG_X86_KPTI
		if (is_flipped_pte(next)) {
			/* Page was flipped for KPTI. Un-flip it */
			next = ~next;
		}
#endif /* CONFIG_X86_KPTI */

		/* OPTION_RESET takes precedence over OPTION_CLEAR */
		if (do_reset) {
			next = reset_pte(next);
		} else if (do_clear) {
			next = 0;
		} else {
			next = ((next & ~update_mask) |
				(update_val & update_mask));
		}

		next = pte_finalize_value(next, for_user, PTE_LEVEL);

		if (atomic_pte_cas(pte, prev, next)) {
			/* Nobody changed the entry in the meantime */
			break;
		}
	}

#ifdef CONFIG_X86_KPTI
	if (is_flipped_pte(prev)) {
		/* Page was flipped for KPTI. Un-flip it */
		prev = ~prev;
	}
#endif /* CONFIG_X86_KPTI */

	return prev;
}
/**
* Low level page table update function for a virtual page
*
* For the provided set of page tables, update the PTE associated with the
* virtual address to a new value, using the mask to control what bits
* need to be preserved.
*
* It is permitted to set up mappings without the Present bit set, in which
* case all other bits may be used for OS accounting.
*
* This function is atomic with respect to the page table entries being
* modified by another CPU, using atomic operations to update the requested
* bits and return the previous PTE value.
*
* Common mask values:
* MASK_ALL - Update all PTE bits. Existing state totally discarded.
* MASK_PERM - Only update permission bits. All other bits and physical
* mapping preserved.
*
* @param ptables Page tables to modify
* @param virt Virtual page table entry to update
* @param entry_val Value to update in the PTE (ignored if OPTION_RESET or
* OPTION_CLEAR)
* @param [out] old_val_ptr Filled in with previous PTE value. May be NULL.
* @param mask What bits to update in the PTE (ignored if OPTION_RESET or
* OPTION_CLEAR)
* @param options Control options, described above
*
* @retval 0 if successful
* @retval -EFAULT if large page encountered or missing page table level