-
Notifications
You must be signed in to change notification settings - Fork 0
/
pmbd.c
4875 lines (4149 loc) · 154 KB
/
pmbd.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Intel Persistent Memory Block Driver
* Copyright (c) <2011-2013>, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
* Intel Persistent Memory Block Driver (v0.9)
*
* Parts derived with changes from drivers/block/brd.c, lib/crc32.c, and
* arch/x86/lib/mmx_32.c
*
* Intel Corporation <[email protected]>
* 03/24/2011
*/
/*
*******************************************************************************
* Persistent Memory Block Device Driver
*
* USAGE:
* % sudo modprobe pmbd mode="pmbd<#>;hmo<#>;hms<#>;[OPTION1];[OPTION2];..>"
*
* GENERAL OPTIONS:
* - pmbd<#,..>: a sequence of integer numbers setting PMBD device sizes (in
* units of GBs). For example, mode="pmbd4,1" means creating a
* 4GB and a 1GB PMBD device (/dev/pma and /dev/pmb).
*
* - HM|VM: choose two types of PMBD devices
* - VM: vmalloc() based
* - HM: HIGH_MEM based (default)
* - In /boot/grub/grub.conf, add "mem=<n>G memmap=<m>G$<n>G"
* to reserve the high m GBs for PM, starting from offset n
* GBs in physical memory
*
* - hmo<#>: if HM is set, setting the starting physical mem address
* (in units of GBs).
*
* - hms<#>: if HM is set, setting the remapping memory size (in GBs)
*
* - pmap<Y|N> set private mapping (Y) or not (N default). using
* pmap_atomic_pfn() to dynamically map/unmap the
* to-be-accessed PM page for protection purpose.
* This option must work with HM enabled. In the Linux boot
* option, "mem" option must be removed.
*
* - nts<Y|N> set non-temporal store/sfence (Y) or not (N default).
*
* - wb<Y|N>: use write barrier (Y) or not (N default)
*
* - fua<Y|N> use WRITE_FUA (Y default) or not (N) (only effective for
* Linux 3.2.1)
*
* SIMULATION OPTIONS:
*
* - simmode<#,#..> set the simulation mode for each PMBD device
* - 0 for simulating the whole device
* - 1 for simulating the PM space only
* Note that simulating the PM space may cause some system
* warning of soft lockup. To disable it, add nosoftlockup
* in the boot options.
*
* - rdlat<#,#..>: a sequence of integer numbers setting emulated read
* latencies (in units of nanoseconds) for reading each
* sector. Each number is corresponding to a device. Default
* value is 0.
*
* - wrlat<#,#..>: set emulated write access latencies (see rdlat)
*
* - rdbw<#,#..>: a sequence of integer numbers setting emulated read
* bandwidth (in units of MB/sec) for reading each sector.
* Each number corresponds to a device. Default value is 0;
*
* - wrbw<#,#..>: set emulated write bandwidth (see rdbw)
*
* - rdsx<#,#..>: set the slowdown ratio (x) for reads as compared to DRAM
*
* - wrsx<#,#..>: set the slowdown ratio (x) for writes as compared to DRAM
*
* - rdpause<#,#..>: set the injected delay (cycles per page) for read (not
* for emulation, just inject latencies for each read per page)
*
* - wrpause<#,#..>: set the injected delay (cycles per page) for write (not for
* emulation, just inject latencies for each read per page).
*
* - adj<#>: offset the overhead with estimated system overhead. Default
* is 4us, however, this could vary system by system.
*
* WRITE PROTECTION:
*
* - wrprot<Y|N>: provide write protection on PM space by setting page
* read-only (default: N). This option is incompatible with pmap.
*
* - wpmode<#,#,..> write protection mode: use the PTE change (0 default) or
* switch CR0/WP bit (1)
*
* - wrverify<Y|N>: read out the data for verification after writing into PM
* space
*
* - clflush<Y|N>: flush CPU cache or not (default: N)
*
* - checksum<Y|N>: use checksum to provide further protection from data
* corruption (default: N)
*
* - lock<Y|N>: lock the on-access PM page to serialize accesses (default: Y)
*
* - bufsize<#,#,#.#...> -- the buffer size in MBs (for speeding up write
* protection) 0 means no buffer, minimum size is 16 MBs
*
* - bufnum<#> the number of buffers for a pmbd device (16 buffers, at
* least 1 if using buffering, 0 will disable buffer mode)
*
* - bufstride<#> the number of contiguous blocks(4KB) mapped into one
* buffer (the bucket size for round-robin mapping) (1024 in default)
*
* - batch<#,#> the batch size (num of pages) for flushing PMBD buffer (1
* means no batching)
*
* MISC OPTIONS:
*
* - subupdate<Y|N> only update changed cachelines of a page (check
* PMBD_CACHELINE_SIZE, default: N)
*
* - mgb<Y|N>: setting mergeable or not (default: Y)
*
* - cache<WB|WC|UM|UC>:
* WB -- write back (both read/write cache used)
* WC -- write combined (write through but cachable)
* UM -- uncachable but write back
* UC -- write through and uncachable
*
* - timestat<Y|N> enable the detailed timing statistics (/proc/pmbd/pmbdstat) or
* not (default: N). This will cause significant performance loss.
*
* EXAMPLE:
* mode="pmbd2,1;rdlat100,2000;wrlat500,4000;rdbw100,100;wrbw100,100;HM;hmo4;hms3;
* mgbY;flushY;cacheWB;wrprotY;wrverifyY;checksumY;lockY;rammode0,1;bufsize16,0;
* subupdateY;"
*
* Explanation: Create two PMBD devices, /dev/pma (2GB) and /dev/pmb (1GB).
* Insert 100ns and 500ns for reading and writing a sector to /dev/pma,
* respectively. Insert 2000ns and 4000ns for reading and writing a sector
* to /dev/pmb. Make the read/write bandwidth for both devices 100MB/sec.
* No system overhead adjustment is applied. We use 3GB high memory for the
* PMBD devices, starting from 4GB physical memory address. Make it
* mergeable, use writeback and flush CPU cache for the PM space, use write
* protection for PM space by setting PM space read-only, verify each
* write by reading out written data, use checksum to protect PM space, use
* spinlock to protect from corruption caused by concurrent accesses, the
* first device is applied without write protection, the second device is
* applied with write protection, and use sub-page updates.
*
* NOTE:
* - We can create no more than 26 devices, 4 partitions each.
*
* FIXME:
* (1) We use an unoccupied major device num (261) temporarily
*******************************************************************************
*/
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/version.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/time.h>
#include <asm/timer.h>
#include <linux/cpufreq.h>
#include <linux/crc32.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/kthread.h>
#include <linux/sort.h>
#include <linux/timex.h>
#include <linux/proc_fs.h>
#include <asm/tlbflush.h>
#include <asm/i387.h>
#include <asm/asm.h>
#include "pmbd.h"
#include <linux/seq_file.h>
#if LINUX_VERSION_CODE == KERNEL_VERSION(3,11,0)
#include <linux/delay.h>
#endif
/* device configs */
static int max_part = 4; /* maximum num of partitions */
static int part_shift = 0; /* partition shift */
static LIST_HEAD(pmbd_devices); /* device list */
static DEFINE_MUTEX(pmbd_devices_mutex); /* device mutex */
static int map=0;
static int nomap=0;
static int off_r=0;
static int off_w=0;
static int size_r=0;
static int size_w=0;
static int page_r=0;
static int page_w=0;
static int file_r=0;
static int file_w=0;
static int ino[100]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
static int ino_rw[100];
static int n_ino=0;
static int map_count=0;
static int ino_super=0;
static int ino_file=0;
static int trans=0;
static int nonamepage=0;
/* /proc file system entry */
static struct proc_dir_entry* proc_pmbd = NULL;
static struct proc_dir_entry* proc_pmbdstat = NULL;
static struct proc_dir_entry* proc_pmbdcfg = NULL;
/* pmbd device default configuration */
static unsigned g_pmbd_type = PMBD_CONFIG_HIGHMEM; /* vmalloc(PMBD_CONFIG_VMALLOC) or reserve highmem (PMBD_CONFIG_HIGHMEM default) */
static unsigned g_pmbd_pmap = FALSE; /* use pmap_atomic() to map/unmap space on demand */
static unsigned g_pmbd_nts = FALSE; /* use non-temporal store (movntq) */
static unsigned g_pmbd_wb = FALSE; /* use write barrier */
static unsigned g_pmbd_fua = TRUE; /* use fua support (Linux 3.2.1) */
static unsigned g_pmbd_mergeable = TRUE; /* mergeable or not */
static unsigned g_pmbd_cpu_cache_clflush= FALSE; /* flush CPU cache or not*/
static unsigned g_pmbd_wr_protect = FALSE; /* flip PTE R/W bits for write protection */
static unsigned g_pmbd_wr_verify = FALSE; /* read out written data for verification */
static unsigned g_pmbd_checksum = FALSE; /* do checksum on PM data */
static unsigned g_pmbd_lock = TRUE; /* do spinlock on accessing a PM page */
static unsigned g_pmbd_subpage_update = FALSE; /* do subpage update (only write changed content) */
static unsigned g_pmbd_timestat = FALSE; /* do a detailed timestamp breakdown statistics */
static unsigned g_pmbd_ntl = FALSE; /* use non-temporal load (movntdqa)*/
static unsigned long g_pmbd_cpu_cache_flag = _PAGE_CACHE_WB; /* CPU cache flag (default - write back) */
/* high memory configs */
static unsigned long g_highmem_size = 0; /* size of the reserved physical mem space (bytes) */
static phys_addr_t g_highmem_phys_addr = 0; /* beginning of the reserved phy mem space (bytes)*/
static void* g_highmem_virt_addr = NULL; /* beginning of the reserve HIGH_MEM space */
static void* g_highmem_curr_addr = NULL; /* beginning of the available HIGH_MEM space for alloc*/
/* module parameters */
static unsigned g_pmbd_nr = 0; /* num of PMBD devices */
static unsigned long long g_pmbd_size[PMBD_MAX_NUM_DEVICES]; /* PMBD device sizes in units of GBs */
static unsigned long long g_pmbd_rdlat[PMBD_MAX_NUM_DEVICES]; /* access latency for read (nanosecs) */
static unsigned long long g_pmbd_wrlat[PMBD_MAX_NUM_DEVICES]; /* access latency for write nanosecs) */
static unsigned long long g_pmbd_rdbw[PMBD_MAX_NUM_DEVICES]; /* bandwidth for read (MB/sec) */
static unsigned long long g_pmbd_wrbw[PMBD_MAX_NUM_DEVICES]; /* bandwidth for write (MB/sec)*/
static unsigned long long g_pmbd_rdsx[PMBD_MAX_NUM_DEVICES]; /* read slowdown (x) */
static unsigned long long g_pmbd_wrsx[PMBD_MAX_NUM_DEVICES]; /* write slowdown (x)*/
static unsigned long long g_pmbd_rdpause[PMBD_MAX_NUM_DEVICES]; /* read pause (cycles per page) */
static unsigned long long g_pmbd_wrpause[PMBD_MAX_NUM_DEVICES]; /* write pause (cycles per page)*/
static unsigned long long g_pmbd_simmode[PMBD_MAX_NUM_DEVICES]; /* simulating PM space (1) or the whole device (0 default) */
static unsigned long long g_pmbd_adjust_ns = 0; /* nanosec of adjustment to offset system overhead */
static unsigned long long g_pmbd_rammode[PMBD_MAX_NUM_DEVICES]; /* do write optimization or not */
static unsigned long long g_pmbd_bufsize[PMBD_MAX_NUM_DEVICES]; /* the buffer size (in MBs) */
static unsigned long long g_pmbd_buffer_batch_size[PMBD_MAX_NUM_DEVICES]; /* the batch size (num of pages) for flushing PMBD buffer */
static unsigned long long g_pmbd_wpmode[PMBD_MAX_NUM_DEVICES]; /* write protection mode: PTE change (0 default) and CR0 Switch (1)*/
static unsigned long long g_pmbd_num_buffers = 0; /* number of individual buffers */
static unsigned long long g_pmbd_buffer_stride = 1024; /* number of contiguous PBNs belonging to the same buffer */
/* definition of functions */
static inline uint64_t cycle_to_ns(uint64_t cycle);
static inline void sync_slowdown_cycles(uint64_t cycles);
static uint64_t emul_start(PMBD_DEVICE_T* pmbd, int num_sectors, int rw);
static uint64_t emul_end(PMBD_DEVICE_T* pmbd, int num_sectors, int rw, uint64_t start);
/*
* *************************************************************************
* parse module parameters functions
* *************************************************************************
*/
static char *mode = "";
module_param(mode, charp, 444);
MODULE_PARM_DESC(mode, USAGE_INFO);
/* print pmbd configuration info */
static void pmbd_print_conf(void)
{
int i;
#ifndef CONFIG_X86
printk(KERN_INFO "pmbd: running on a non-x86 platform, check ioremap()...\n");
#endif
printk(KERN_INFO "pmbd: cacheline_size=%d\n", PMBD_CACHELINE_SIZE);
printk(KERN_INFO "pmbd: PMBD_SECTOR_SIZE=%lu, PMBD_PAGE_SIZE=%lu\n", PMBD_SECTOR_SIZE, PMBD_PAGE_SIZE);
printk(KERN_INFO "pmbd: g_pmbd_type = %s\n", PMBD_USE_VMALLOC()? "VMALLOC" : "HIGH_MEM");
printk(KERN_INFO "pmbd: g_pmbd_mergeable = %s\n", PMBD_IS_MERGEABLE()? "YES" : "NO");
printk(KERN_INFO "pmbd: g_pmbd_cpu_cache_clflush = %s\n", PMBD_USE_CLFLUSH()? "YES" : "NO");
printk(KERN_INFO "pmbd: g_pmbd_cpu_cache_flag = %s\n", PMBD_CPU_CACHE_FLAG());
printk(KERN_INFO "pmbd: g_pmbd_wr_protect = %s\n", PMBD_USE_WRITE_PROTECTION()? "YES" : "NO");
printk(KERN_INFO "pmbd: g_pmbd_wr_verify = %s\n", PMBD_USE_WRITE_VERIFICATION()? "YES" : "NO");
printk(KERN_INFO "pmbd: g_pmbd_checksum = %s\n", PMBD_USE_CHECKSUM()? "YES" : "NO");
printk(KERN_INFO "pmbd: g_pmbd_lock = %s\n", PMBD_USE_LOCK()? "YES" : "NO");
printk(KERN_INFO "pmbd: g_pmbd_subpage_update = %s\n", PMBD_USE_SUBPAGE_UPDATE()? "YES" : "NO");
printk(KERN_INFO "pmbd: g_pmbd_adjust_ns = %llu ns\n", g_pmbd_adjust_ns);
printk(KERN_INFO "pmbd: g_pmbd_num_buffers = %llu\n", g_pmbd_num_buffers);
printk(KERN_INFO "pmbd: g_pmbd_buffer_stride = %llu blocks\n", g_pmbd_buffer_stride);
printk(KERN_INFO "pmbd: g_pmbd_timestat = %u \n", g_pmbd_timestat);
printk(KERN_INFO "pmbd: HIGHMEM offset [%llu] size [%lu] Private Mapping (%s) (%s) (%s) Write Barrier(%s) FUA(%s)\n",
g_highmem_phys_addr, g_highmem_size, (PMBD_USE_PMAP()? "Enabled" : "Disabled"),
(PMBD_USE_NTS()? "Non-Temporal Store":"Temporal Store"),
(PMBD_USE_NTL()? "Non-Temporal Load":"Temporal Load"),
(PMBD_USE_WB()? "Enabled": "Disabled"),
(PMBD_USE_FUA()? "Enabled":"Disabled"));
/* for each pmbd device */
for (i = 0; i < g_pmbd_nr; i ++) {
printk(KERN_INFO "pmbd: /dev/pm%c (%d)[%llu GB] read[%llu ns %llu MB/sec (%llux) (pause %llu cyc/pg)] write[%llu ns %llu MB/sec (%llux) (pause %llu cyc/pg)] [%s] [Buf: %llu MBs, batch %llu pages] [%s] [%s]\n",
'a'+i, i, g_pmbd_size[i], g_pmbd_rdlat[i], g_pmbd_rdbw[i], g_pmbd_rdsx[i], g_pmbd_rdpause[i], g_pmbd_wrlat[i], g_pmbd_wrbw[i], g_pmbd_wrsx[i], g_pmbd_wrpause[i],\
(g_pmbd_rammode[i] ? "RAM" : "PMBD"), g_pmbd_bufsize[i], g_pmbd_buffer_batch_size[i], \
(g_pmbd_simmode[i] ? "Simulating PM only" : "Simulating the whole device"), \
(PMBD_USE_PMAP() ? "PMAP" : (g_pmbd_wpmode[i] ? "WP-CR0/WP" : "WP-PTE")));
if (g_pmbd_simmode[i] > 0){
printk(KERN_INFO "pmbd: ********************************* WARNING **************************************\n");
printk(KERN_INFO "pmbd: Using simmode%llu to simulate a slowed-down PM space may cause system soft lockup.\n", g_pmbd_simmode[i]);
printk(KERN_INFO "pmbd: To disable the warning message, please add \"nosoftlockup\" in the boot option. \n");
printk(KERN_INFO "pmbd: ********************************************************************************\n");
}
}
printk(KERN_INFO "pmbd: ****************************** WARNING ***********************************\n");
printk(KERN_INFO "pmbd: 1. Checksum mismatch can be detected but not handled \n");
printk(KERN_INFO "pmbd: 2. PMAP is incompatible with \"wrprotY\"\n");
printk(KERN_INFO "pmbd: **************************************************************************\n");
return;
}
/*
* Parse a string with config for multiple devices (e.g. mode="pmbd4,1,3;")
* @mode: input option string
* @tag: the tag being looked for (e.g. pmbd)
* @data: output in an array
*/
static int _pmbd_parse_multi(char* mode, char* tag, unsigned long long data[])
{
int nr = 0;
if (strlen(mode)) {
char* head = mode;
char* tail = mode;
char* end = mode + strlen(mode);
char tmp[128];
if ((head = strstr(mode, tag))) {
head = head + strlen(tag);
tail = head;
while(head < end){
int len = 0;
/* locate the position of the first non-number char */
for(tail = head; IS_DIGIT(*tail) && tail < end; tail++) {};
/* pick up the numbers */
len = tail - head;
if(len > 0) {
nr ++;
if (nr > PMBD_MAX_NUM_DEVICES) {
printk(KERN_ERR "pmbd: %s(%d) - too many (%d) device config for %s\n",
__FUNCTION__, __LINE__, nr, tag);
return -1;
}
strncpy(tmp, head, len); tmp[len] = '\0';
data[nr - 1] = simple_strtoull(tmp, NULL, 0);
}
/* check the next sequence of numbers */
for(; !IS_DIGIT(*tail) && tail < end; tail++) {
/* if we meet the first alpha char or space, clause ends */
if(IS_ALPHA(*tail) || IS_SPACE(*tail))
goto done;
};
/* move head to the next sequence of numbers */
head = tail;
}
}
}
done:
return nr;
}
/*
* Parse a string with config for all devices (e.g. mode="adj1000")
* @mode: input option string
* @tag: the tag being looked for (e.g. pmbd)
* @data: output
*/
static int _pmbd_parse_single(char* mode, char* tag, unsigned long long* data)
{
if (strlen(mode)) {
char* head = mode;
char* tail = mode;
char tmp[128];
if (strstr(mode, tag)) {
head = strstr(mode, tag) + strlen(tag);
for(tail=head; IS_DIGIT(*tail); tail++) {};
if(tail == head) {
return -1;
} else {
int len = tail - head;
strncpy(tmp, head, len); tmp[len] = '\0';
*data = simple_strtoull(tmp, NULL, 0);
}
}
}
return 0;
}
static void load_default_conf(void)
{
int i = 0;
for (i = 0; i < PMBD_MAX_NUM_DEVICES; i ++)
g_pmbd_buffer_batch_size[i] = PMBD_BUFFER_BATCH_SIZE_DEFAULT;
}
/* parse the module parameters (mode) */
static void pmbd_parse_conf(void)
{
int i = 0;
static unsigned enforce_cache_wc = FALSE;
load_default_conf();
if (strlen(mode)) {
unsigned long long data = 0;
/* check pmbd size/usable */
if (strstr(mode, "pmbd")) {
if( (g_pmbd_nr = _pmbd_parse_multi(mode, "pmbd", g_pmbd_size)) <= 0)
goto fail;
} else {
printk(KERN_ERR "pmbd: no pmbd size set\n");
goto fail;
}
/* rdlat/wrlat (emulated read/write latency) in nanosec */
if (strstr(mode, "rdlat"))
if (_pmbd_parse_multi(mode, "rdlat", g_pmbd_rdlat) < 0)
goto fail;
if (strstr(mode, "wrlat"))
if (_pmbd_parse_multi(mode, "wrlat", g_pmbd_wrlat) < 0)
goto fail;
/* rdbw/wrbw (emulated read/write bandwidth) in MB/sec*/
if (strstr(mode, "rdbw"))
if (_pmbd_parse_multi(mode, "rdbw", g_pmbd_rdbw) < 0)
goto fail;
if (strstr(mode, "wrbw"))
if (_pmbd_parse_multi(mode, "wrbw", g_pmbd_wrbw) < 0)
goto fail;
/* rdsx/wrsx (emulated read/write slowdown X) */
if (strstr(mode, "rdsx"))
if (_pmbd_parse_multi(mode, "rdsx", g_pmbd_rdsx) < 0)
goto fail;
if (strstr(mode, "wrsx"))
if (_pmbd_parse_multi(mode, "wrsx", g_pmbd_wrsx) < 0)
goto fail;
/* rdsx/wrsx (emulated read/write slowdown X) */
if (strstr(mode, "rdpause"))
if (_pmbd_parse_multi(mode, "rdpause", g_pmbd_rdpause) < 0)
goto fail;
if (strstr(mode, "wrpause"))
if (_pmbd_parse_multi(mode, "wrpause", g_pmbd_wrpause) < 0)
goto fail;
/* do write optimization */
if (strstr(mode, "rammode")){
printk(KERN_ERR "pmbd: rammode removed\n");
goto fail;
if (_pmbd_parse_multi(mode, "rammode", g_pmbd_rammode) < 0)
goto fail;
}
if (strstr(mode, "bufsize")){
if (_pmbd_parse_multi(mode, "bufsize", g_pmbd_bufsize) < 0)
goto fail;
for (i = 0; i < PMBD_MAX_NUM_DEVICES; i ++) {
if (g_pmbd_bufsize[i] > 0 && g_pmbd_bufsize[i] < PMBD_BUFFER_MIN_BUFSIZE){
printk(KERN_ERR "pmbd: bufsize cannot be smaller than %d MBs. Setting 0 to disable PMBD buffer.\n", PMBD_BUFFER_MIN_BUFSIZE);
goto fail;
}
}
}
/* numbuf and bufstride*/
if (strstr(mode, "bufnum")) {
if(_pmbd_parse_single(mode, "bufnum", &data) < 0) {
printk(KERN_ERR "pmbd: incorrect bufnum (must be at least 1)\n");
goto fail;
} else {
g_pmbd_num_buffers = data;
}
}
if (strstr(mode, "bufstride")) {
if(_pmbd_parse_single(mode, "bufstride", &data) < 0) {
printk(KERN_ERR "pmbd: incorrect bufstride (must be at least 1)\n");
goto fail;
} else {
g_pmbd_buffer_stride = data;
}
}
/* check the nanoseconds of overhead to compensate */
if (strstr(mode, "adj")) {
if(_pmbd_parse_single(mode, "adj", &data) < 0) {
printk(KERN_ERR "pmbd: incorrect adj\n");
goto fail;
} else {
g_pmbd_adjust_ns = data;
}
}
/* check PMBD device type */
if ((strstr(mode, "VM"))) {
g_pmbd_type = PMBD_CONFIG_VMALLOC;
} else if ((strstr(mode, "HM"))) {
g_pmbd_type = PMBD_CONFIG_HIGHMEM;
}
/* use pmap*/
if ((strstr(mode, "pmapY"))) {
g_pmbd_pmap = TRUE;
} else if ((strstr(mode, "pmapN"))) {
g_pmbd_pmap = FALSE;
}
if ((strstr(mode, "PMAP"))){
printk("WARNING: !!! pmbd: PMAP is not supported any more (use pmapY) !!!\n");
goto fail;
}
/* use nts*/
if ((strstr(mode, "ntsY"))) {
g_pmbd_nts = TRUE;
} else if ((strstr(mode, "ntsN"))) {
g_pmbd_nts = FALSE;
}
if ((strstr(mode, "NTS"))){
printk("WARNING: !!! pmbd: NTS is not supported any more (use ntsY) !!!\n");
goto fail;
}
/* use ntl*/
if ((strstr(mode, "ntlY"))) {
g_pmbd_ntl = TRUE;
enforce_cache_wc = TRUE;
} else if ((strstr(mode, "ntlN"))) {
g_pmbd_ntl = FALSE;
}
/* timestat */
if ((strstr(mode, "timestatY"))) {
g_pmbd_timestat = TRUE;
} else if ((strstr(mode, "timestatN"))) {
g_pmbd_timestat = FALSE;
}
/* write barrier */
if ((strstr(mode, "wbY"))) {
g_pmbd_wb = TRUE;
} else if ((strstr(mode, "wbN"))) {
g_pmbd_wb = FALSE;
}
/* write barrier */
if ((strstr(mode, "fuaY"))) {
g_pmbd_fua = TRUE;
} else if ((strstr(mode, "fuaN"))) {
g_pmbd_fua = FALSE;
}
/* check if HIGH_MEM PMBD is configured */
if (PMBD_USE_HIGHMEM()) {
if (strstr(mode, "hmo") && strstr(mode, "hms")) {
/* parse reserved HIGH_MEM offset */
if(_pmbd_parse_single(mode, "hmo", &data) < 0){
printk(KERN_ERR "pmbd: incorrect hmo\n");
g_highmem_phys_addr = 0;
goto fail;
} else {
g_highmem_phys_addr = data * 1024 * 1024 * 1024;
}
/* parse reserved HIGH_MEM size */
if(_pmbd_parse_single(mode, "hms", &data) < 0 || data == 0){
printk(KERN_ERR "pmbd: incorrect hms\n");
g_highmem_size = 0;
goto fail;
} else {
g_highmem_size = data * 1024 * 1024 * 1024;
}
} else {
printk(KERN_ERR "pmbd: hmo or hms not set ***\n");
goto fail;
}
}
/* check if mergeable */
if((strstr(mode,"mgbY")))
g_pmbd_mergeable = TRUE;
else if((strstr(mode,"mgbN")))
g_pmbd_mergeable = FALSE;
/* CPU cache flushing */
if((strstr(mode,"clflushY")))
g_pmbd_cpu_cache_clflush = TRUE;
else if((strstr(mode,"clflushN")))
g_pmbd_cpu_cache_clflush = FALSE;
/* CPU cache setting */
if((strstr(mode,"cacheWB"))) /* cache write back */
g_pmbd_cpu_cache_flag = _PAGE_CACHE_WB;
else if((strstr(mode,"cacheWC"))) /* cache write combined (through) */
g_pmbd_cpu_cache_flag = _PAGE_CACHE_WC;
else if((strstr(mode,"cacheUM"))) /* cache cachable but write back */
g_pmbd_cpu_cache_flag = _PAGE_CACHE_UC_MINUS;
else if((strstr(mode,"cacheUC"))) /* cache uncablable */
g_pmbd_cpu_cache_flag = _PAGE_CACHE_UC;
/* write protectable */
if((strstr(mode,"wrprotY")))
g_pmbd_wr_protect = TRUE;
else if((strstr(mode,"wrprotN")))
g_pmbd_wr_protect = FALSE;
/* write protectable */
if((strstr(mode,"wrverifyY")))
g_pmbd_wr_verify = TRUE;
else if((strstr(mode,"wrverifyN")))
g_pmbd_wr_verify = FALSE;
/* checksum */
if((strstr(mode,"checksumY")))
g_pmbd_checksum = TRUE;
else if((strstr(mode,"checksumN")))
g_pmbd_checksum = FALSE;
/* checksum */
if((strstr(mode,"lockY")))
g_pmbd_lock = TRUE;
else if((strstr(mode,"lockN")))
g_pmbd_lock = FALSE;
/* write protectable */
if((strstr(mode,"subupdateY")))
g_pmbd_subpage_update = TRUE;
else if((strstr(mode,"subupdateN")))
g_pmbd_subpage_update = FALSE;
/* batch */
if (strstr(mode, "batch")){
if (_pmbd_parse_multi(mode, "batch", g_pmbd_buffer_batch_size) < 0)
goto fail;
/* check if any batch size is set too small */
for (i = 0; i < PMBD_MAX_NUM_DEVICES; i ++) {
if (g_pmbd_buffer_batch_size[i] < 1){
printk(KERN_ERR "pmbd: buffer batch size cannot be smaller than 1 page (default: 1024 pages)\n");
goto fail;
}
}
}
/* simmode */
if (strstr(mode, "simmode")){
if (_pmbd_parse_multi(mode, "simmode", g_pmbd_simmode) < 0)
goto fail;
}
/* wpmode */
if (strstr(mode, "wpmode")){
if (_pmbd_parse_multi(mode, "wpmode", g_pmbd_wpmode) < 0)
goto fail;
}
} else {
goto fail;
}
/* apply some enforced configuration */
if (enforce_cache_wc) /* if ntl is used, we must use WC */
g_pmbd_cpu_cache_flag = _PAGE_CACHE_WC;
/* Done, print input options */
pmbd_print_conf();
return;
fail:
printk(KERN_ERR "pmbd: wrong mode config! Check modinfo\n\n");
g_pmbd_nr = 0;
return;
}
/*
* *****************************************************************
* simple emulation API functions
* pmbd_rdwr_pause - pause read/write for a specified cycles/page
* pmbd_rdwr_slowdown - slowdown read/write proportionally to DRAM
* *****************************************************************/
/* handle rdpause and wrpause options*/
static void pmbd_rdwr_pause(PMBD_DEVICE_T* pmbd, size_t bytes, unsigned rw)
{
uint64_t cycles = 0;
uint64_t time_p1, time_p2;
/* sanity check */
if (pmbd->rdpause == 0 && pmbd->wrpause == 0)
return;
/* start */
TIMESTAT_POINT(time_p1);
/* calculate the cycles to pause */
if (rw == READ && pmbd->rdpause){
cycles = MAX_OF((BYTE_TO_PAGE(bytes) * pmbd->rdpause), pmbd->rdpause);
} else if (rw == WRITE && pmbd->wrpause){
cycles = MAX_OF((BYTE_TO_PAGE(bytes) * pmbd->wrpause), pmbd->wrpause);
}
/* slow down now */
if (cycles)
sync_slowdown_cycles(cycles);
TIMESTAT_POINT(time_p2);
if(PMBD_USE_TIMESTAT()){
int cid = CUR_CPU_ID();
PMBD_STAT_T* pmbd_stat = pmbd->pmbd_stat;
pmbd_stat->cycles_pause[rw][cid] += time_p2 - time_p1;
}
return;
}
/* handle rdsx and wrsx options */
static void pmbd_rdwr_slowdown(PMBD_DEVICE_T* pmbd, int rw, uint64_t start, uint64_t end)
{
uint64_t cycles = 0;
uint64_t time_p1, time_p2;
/* sanity check */
if ( !((rw == READ && pmbd->rdsx > 1) || (rw == WRITE && pmbd->wrsx > 1)))
return;
if (end < start){
printk(KERN_WARNING "pmbd: %s(%d) end (%llu) is earlier than start (%llu)\n", \
__FUNCTION__, __LINE__, (unsigned long long) start, (unsigned long long)end);
return;
}
/* start */
TIMESTAT_POINT(time_p1);
/*FIXME: should we allow to do async slowdown? */
cycles = (end-start)*((rw == READ) ? (pmbd->rdsx - 1) : (pmbd->wrsx -1));
/*FIXME: should we minus a slack here (80-100cycles)? */
if (cycles)
sync_slowdown_cycles(cycles);
TIMESTAT_POINT(time_p2);
/* updating statistics */
if(PMBD_USE_TIMESTAT()){
int cid = CUR_CPU_ID();
PMBD_STAT_T* pmbd_stat = pmbd->pmbd_stat;
pmbd_stat->cycles_slowdown[rw][cid] += time_p2 - time_p1;
}
return;
}
/*
* set page's cache flags
* @vaddr: start virtual address
* @num_pages: the range size
*/
static void set_pages_cache_flags(unsigned long vaddr, int num_pages)
{
switch (g_pmbd_cpu_cache_flag) {
case _PAGE_CACHE_WB:
printk(KERN_INFO "pmbd: set PM pages cache flags (WB)\n");
set_memory_wb(vaddr, num_pages);
break;
case _PAGE_CACHE_WC:
printk(KERN_INFO "pmbd: set PM pages cache flags (WC)\n");
set_memory_wc(vaddr, num_pages);
break;
case _PAGE_CACHE_UC:
printk(KERN_INFO "pmbd: set PM pages cache flags (UC)\n");
set_memory_uc(vaddr, num_pages);
break;
case _PAGE_CACHE_UC_MINUS:
printk(KERN_INFO "pmbd: set PM pages cache flags (UM)\n");
set_memory_uc(vaddr, num_pages);
break;
default:
set_memory_wb(vaddr, num_pages);
printk(KERN_WARNING "pmbd: PM page attribute is not set - use WB\n");
break;
}
return;
}
/*
* *************************************************************************
* PMAP - Private mapping interface APIs
* *************************************************************************
*
* The private mapping is for providing write protection -- only when we need
* to access the PM page, we map it into the kernel virtual memory space, once
* we finish using it, we unmap it, so the spatial and temporal window left for
* bug attack is really small.
*
* Notes: pmap works similar to kmap_atomic*. It does the following:
* (1) pmap_create(): allocate 128 pages with vmalloc, these 128 pte mapping is
* saved to a backup place, and then be cleared to prevent accidental accesses.
* Each page is assigned correspondingly to the CPU ID where the calling thread
* is running on. So we support at most 128 CPU IDs.
* (2) pmap_atomic_pfn(): map the specified pfn into the entry, whose index is
* the ID of the CPU on which the current thread is running. The pfn is loaded
* into the corresponding pte entry and the corresponding TLB entry is flushed
* (3) punmap_atomic(): the specified pte entry is cleared, and the TLB entry
* is flushed
* (4) pmap_destroy(): the saved pte mapping of the 128 pages are restored, and
* vfree() is called to release the 128 pages allocated through vmalloc().
*
*/
/* one private-mapping slot per CPU ID; supports at most 128 CPUs */
#define PMAP_NR_PAGES (128)
static unsigned int pmap_nr_pages = 0; /* the total number of available pages for private mapping */
static void* pmap_va_start = NULL; /* the first PMAP virtual address (base of the vmalloc'ed dummy pages) */
static pte_t* pmap_ptep[PMAP_NR_PAGES]; /* the array of PTE entries, one per slot */
static unsigned long pmap_pfn[PMAP_NR_PAGES]; /* original page frame numbers, saved for restoring at teardown */
static pgprot_t pmap_prot[PMAP_NR_PAGES]; /* original page protection fields, reused for dynamic mappings */
/* virtual address of pmap slot IDX; NOTE: arithmetic on void* is a GCC extension */
#define PMAP_VA(IDX) (pmap_va_start + (IDX) * PAGE_SIZE)
/* slot index that virtual address VA falls in (inverse of PMAP_VA) */
#define PMAP_IDX(VA) (((unsigned long)(VA) - (unsigned long)pmap_va_start) >> PAGE_SHIFT)
/*
 * Invalidate the TLB entry covering @addr on the local CPU via INVLPG.
 * Not referenced by the active code in this section -- update_pmap_pfn()
 * and clear_pmap_pfn() use __flush_tlb_one() instead.
 */
static inline void pmap_flush_tlb_single(unsigned long addr)
{
	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
}
/*
 * Load @pfn into pmap slot @idx (using the protection bits saved for that
 * slot) and flush the slot's TLB entry.  Returns the slot's kernel virtual
 * address.  A no-op (beyond the return) when the slot already holds an
 * identical PTE.
 */
static inline void* update_pmap_pfn(unsigned long pfn, unsigned int idx)
{
	void* va = PMAP_VA(idx);
	pte_t* slot = pmap_ptep[idx];
	pte_t want = pfn_pte(pfn, pmap_prot[idx]);

	if (pte_val(*slot) != pte_val(want)) {
		/* install the new mapping ... */
		set_pte_atomic(slot, want);
		/* ... and drop the stale TLB entry for this single page */
		__flush_tlb_one((unsigned long) va);
	}
	return va;
}
/*
 * Tear down the mapping in pmap slot @idx: clear its PTE and flush the
 * corresponding TLB entry so any later access faults.  Panics on an
 * out-of-range slot index.
 */
static inline void clear_pmap_pfn(unsigned idx)
{
	void* va;
	pte_t* ptep;

	if (idx >= pmap_nr_pages)
		panic("%s(%d) illegal pmap idx\n", __FUNCTION__, __LINE__);

	va = PMAP_VA(idx);
	ptep = pmap_ptep[idx];
	/* wipe the PTE, then invalidate the single TLB entry */
	pte_clear(NULL, (unsigned long) va, ptep);
	__flush_tlb_one((unsigned long) va);
}
/*
 * Set up the private-mapping (pmap) machinery: reserve PMAP_NR_PAGES dummy
 * pages of kernel virtual address space via vmalloc(), record each page's
 * PTE pointer, original pfn, and protection bits, then clear all the PTEs
 * so the range cannot be touched until pmap_atomic_pfn() maps a PM page in.
 * Returns 0 on success, -ENOMEM if the vmalloc() fails; panics on any
 * inconsistency found while walking the page tables.
 */
static int pmap_atomic_init(void)
{
	unsigned int slot;

	/* initializing twice indicates a programming error */
	if (pmap_va_start)
		panic("%s(%d) something is wrong\n", __FUNCTION__, __LINE__);

	/* grab the dummy pages that back the pmap virtual addresses */
	pmap_va_start = vmalloc(PAGE_SIZE * PMAP_NR_PAGES);
	if (!pmap_va_start){
		printk(KERN_ERR "pmbd:%s(%d) pmap_va_start cannot be initialized\n", __FUNCTION__, __LINE__);
		return -ENOMEM;
	}
	pmap_nr_pages = PMAP_NR_PAGES;

	/* set pages' cache flags; the flag is captured below in pmap_prot
	 * and therefore also applies to each dynamically mapped page (01/12/2012) */
	set_pages_cache_flags((unsigned long)pmap_va_start, pmap_nr_pages);

	/* record each dummy page's ptep, pfn, and prot so that
	 * pmap_atomic_done() can restore them later */
	printk(KERN_INFO "pmbd: saving dummy pmap entries\n");
	for (slot = 0; slot < pmap_nr_pages; slot ++){
		unsigned int level;
		void* va = PMAP_VA(slot);
		pte_t* ptep = lookup_address((unsigned long)(va), &level);
		pte_t saved;

		/* the vmalloc'ed range must be backed by valid 4KB PTEs */
		if (!ptep)
			panic("%s(%d) mapping not found\n", __FUNCTION__, __LINE__);
		saved = *ptep;
		if (!pte_val(saved))
			panic("%s(%d) invalid pte value\n", __FUNCTION__, __LINE__);
		if (level != PG_LEVEL_4K)
			panic("%s(%d) not PG_LEVEL_4K \n", __FUNCTION__, __LINE__);

		pmap_ptep[slot] = ptep;
		pmap_pfn[slot] = pte_pfn(saved);
		pmap_prot[slot] = pte_pgprot(saved);
	}

	/* wipe the PTEs so any access before an explicit mapping faults */
	for (slot = 0; slot < pmap_nr_pages; slot ++)
		clear_pmap_pfn(slot);
	return 0;
}
/*
 * Tear down the private-mapping (pmap) machinery set up by
 * pmap_atomic_init(): re-install the original PTEs of the dummy vmalloc
 * pages (which were cleared at init time) and then release the pages.
 * Panics if called when pmap was never initialized (pmap_va_start == NULL).
 */
static void pmap_atomic_done(void)
{
	/* unsigned to match pmap_nr_pages and the loop in pmap_atomic_init()
	 * (fixes a signed/unsigned comparison) */
	unsigned int i;

	/* restore the dummy pages' pte */
	printk(KERN_INFO "pmbd: restoring dummy pmap entries\n");
	for (i = 0; i < pmap_nr_pages; i ++){
		/* re-install the pfn saved at init time so vfree() sees the
		 * pages exactly as vmalloc() created them */
		update_pmap_pfn(pmap_pfn[i], i);
		pmap_ptep[i]= NULL;
		pmap_pfn[i] = 0;
	}
	/* free the dummy pages*/
	if (pmap_va_start)
		vfree(pmap_va_start);
	else
		panic("%s(%d): freeing dummy pages failed\n", __FUNCTION__, __LINE__);
	pmap_va_start = NULL;
	pmap_nr_pages = 0;
	return;
}
static void* pmap_atomic_pfn(unsigned long pfn, PMBD_DEVICE_T* pmbd, unsigned rw)
{
void* va = NULL;