Monday, December 20, 2010

NMI handler

Trying to register an NMI handler in the kernel caused a lot of kernel crashes.
OProfile uses low-level kernel hacks to install its NMI handler.
I found a recent patch that reverted an easy-to-use NMI registration function: http://lkml.indiana.edu/hypermail/linux/kernel/0605.1/0524.html
So I restored some of the code from that patch in order to register my own handler.
______________
Looking into the OProfile code, I decided not to do what I originally posted.

I went ahead and ported the OProfile NMI handler over. It does a low-level memory operation to swap out the kernel's NMI handler for its own. This way the response time to receive the interrupt is minimized.

Friday, December 17, 2010

Reading and writing to performance counter

I have never written real x86 assembly code, so figuring out how to initialize a performance counter was a feat.

The Intel manual describes how to determine functionality using notation like this:
CPUID.0AH:EAX[bits 7:0] (which is the performance monitor version number). Initially it made no sense to me.

I soon discovered that CPUID is actually an instruction that reads the EAX register and puts its results into EAX, EBX, ECX, and EDX. So 0AH is notation for the hex value 0x0a to be put into EAX before executing CPUID, and bits 0 through 7 of EAX then hold the result.

Thanks to inline assembly, it makes this easy:

unsigned a, b, c, d;
/* Execute CPUID with 0x0a in EAX; the four output registers are captured in a, b, c and d. */
asm("cpuid" //assembly instruction
: "=a" (a), "=b" (b) , "=c" (c), "=d" (d) //output: the letter in quotes names the register and the variable in parentheses receives its value
: "a" (0xa) ); //input: same notation as the outputs, except the hex value is passed directly in the parentheses

// NOTE(review): v is not declared anywhere in this snippet — confirm it is declared by the surrounding code
v = a & 0xff; //keep the low 8 bits (bits 7:0) of EAX
printk(KERN_INFO "Version Identifier: %u \n", v);

__

So now that I know the version number, I can set up a counter for use. Every performance counter on the Intel chipset has a corresponding event selector address space (defined in the Intel manual).
The region has two fields, one for the event number and one for the mask. The rest of the region is flags for fine-tuning the counting operation. To start the counter I wrote an unsigned int to the region, as only the lower 32 bits are used.
To prepare the unsigned int I created the following function:

/*
 * Build the low 32 bits of a performance event-select control word.
 *
 * event: event number; only the low 8 bits are used (placed in bits 7:0).
 * mask:  unit mask; only the low 8 bits are used (placed in bits 15:8).
 *
 * Returns the two fields combined with the fixed flag settings below:
 * USR, ANY and EN are set; OS, E, PC, INT and INV are clear.
 */
unsigned preparePERFEVTSEL(unsigned event, unsigned mask){
	/* Flags are unsigned so every shift below is done in unsigned arithmetic. */
	const unsigned usr_flag = 0x1; //track user level code
	const unsigned os_flag  = 0x0; //OS detection, TODO: do we want system calls tracked?
	const unsigned edge     = 0x0; //edge
	const unsigned pin_ctl  = 0x0; //pin control
	const unsigned intr     = 0x0; //interrupt, turn on for replaying
	/* NOTE(review): bit 21 may not be defined on older performance-monitoring versions — confirm against the probed version number */
	const unsigned any      = 0x1; //tracks threads across processors
	const unsigned enable   = 0x1; //enable
	const unsigned invert   = 0x0; //invert counter mask

	unsigned prepared = 0x0;

	prepared |= event & 0xff;
	prepared |= (mask & 0xff) << 8;
	prepared |= (usr_flag & 0x1) << 16;
	prepared |= (os_flag & 0x1) << 17;
	prepared |= (edge & 0x1) << 18;
	prepared |= (pin_ctl & 0x1) << 19;
	prepared |= (intr & 0x1) << 20;
	prepared |= (any & 0x1) << 21;
	prepared |= (enable & 0x1) << 22;
	prepared |= (invert & 0x1) << 23;

	return prepared;
}

I then was able to write to the event selection address space:

unsigned event = preparePERFEVTSEL(br_inst, br_umask);
reg = IA32_PERFEVTSEL0; //#defined
asm("wrmsr"
:
: "a" (event), "c" (reg));



___________________________
Update:

All of the MSR operations have a nice wrapper within the linux kernel linux/msr.h
I can do a simple wrmsrl(MSR location, unsigned long long input) and rdmsrl(MSR location, unsigned long long output).

These operate on the full 64-bit value; there are also the corresponding wrmsr(MSR location, low, high) and rdmsr(MSR location, low, high), which take the value as two 32-bit halves.

Also...
There seems to be perf watchdog code that allocates the performance counters to coordinate between processors. It looks like the kernel already uses performance counter 0 for its own purposes.

Wednesday, December 15, 2010

to get 64 bit support working:
Modify include/asm-x86/unistd_64.h
--- a/include/asm-x86/unistd_64.h
+++ b/include/asm-x86/unistd_64.h
@@ -635,6 +635,12 @@ __SYSCALL(__NR_timerfd, sys_timerfd)
__SYSCALL(__NR_eventfd, sys_eventfd)
#define __NR_fallocate 285
__SYSCALL(__NR_fallocate, sys_fallocate)
+#define __NR_start_rec 286
+__SYSCALL(__NR_start_rec, sys_start_rec)
+#define __NR_stop_rec 287
+__SYSCALL(__NR_stop_rec, sys_stop_rec)
+#define __NR_rec_owner 288
+__SYSCALL(__NR_rec_owner, sys_rec_owner)

#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR

Saturday, October 16, 2010

Creating a system call

I am starting a project with Professor Yang.

It's based on Dunlap, Lucchetti and Chen's paper "Execution Replay for Multiprocessor Virtual Machines."

The paper uses page fault protection to replay execution on Multiprocessors. It uses CREW, which is concurrent-read, exclusive-write. This allows us to log the ordering of each read and write.

The paper modified a hypervisor to implement CREW events, but I will be modifying Linux kernel 2.6.24 to implement it.

I wanted to be able to record the instruction and branch of a thread, so that table permissions can be replayed at precisely the same location as in the recorded process.

Kernel 2.6.24 recently merged the i386 and x86_64 directories into a single arch/x86, since 32- and 64-bit architectures usually require the same patches. To add a system call to the 32-bit side of the kernel I modified the following:

First add the system calls to syscall_table:
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -324,3 +324,5 @@ ENTRY(sys_call_table)
.long sys_timerfd
.long sys_eventfd
.long sys_fallocate
+ .long sys_start_rec /*325*/
+ .long sys_stop_rec
+ .long sys_rec_owner

Next modify unistd.h:

--- a/include/asm-x86/unistd_32.h
+++ b/include/asm-x86/unistd_32.h
@@ -330,10 +330,12 @@
#define __NR_timerfd 322
#define __NR_eventfd 323
#define __NR_fallocate 324
+ #define __NR_start_rec 325
+#define __NR_stop_rec 326
+#define __NR_rec_owner 327


#ifdef __KERNEL__

-#define NR_syscalls 325
+#define NR_syscalls 328


Next I created arch/x86/kernel/record.c



1 #include
2 #include
3 //Create mock syscalls to use module
4
5
6 long (*start_rec)(void) = NULL; /* hook called by sys_start_rec; NULL until something assigns a handler */
7 EXPORT_SYMBOL(start_rec);
8
9 asmlinkage long sys_start_rec(void){ /* forwards to the start_rec hook, or returns -ENOSYS while the hook is NULL */
10 return start_rec ? start_rec() : -ENOSYS; /* NOTE(review): the hook is read twice with no locking — confirm it cannot be cleared between the check and the call */
11 }
12
13 long (*stop_rec)(void) = NULL; /* hook called by sys_stop_rec; NULL until something assigns a handler */
14 EXPORT_SYMBOL(stop_rec);
15
16 asmlinkage long sys_stop_rec(void){ /* forwards to the stop_rec hook, or returns -ENOSYS while the hook is NULL */
17 return stop_rec ? stop_rec() : -ENOSYS; /* NOTE(review): the hook is read twice with no locking — confirm it cannot be cleared between the check and the call */
18 }
19
20 long (*rec_owner)(void) = NULL; /* hook called by sys_rec_owner; NULL until something assigns a handler */
21 EXPORT_SYMBOL(rec_owner);
22
23 asmlinkage long sys_rec_owner(void){ /* forwards to the rec_owner hook, or returns -ENOSYS while the hook is NULL */
24 return rec_owner ? rec_owner() : -ENOSYS; /* NOTE(review): the hook is read twice with no locking — confirm it cannot be cleared between the check and the call */
25 }


I then created a module to handle the actual system calls:

1 /*
2 * branchInfo.c -- recording replaying instructions
3 */
4 #include /* Needed by all modules */
5 #include /* Needed for KERN_INFO */
6 #include
7 #include "br_msr.h"
8 #include "br_record.h"
9 #include "br_pmcaccess.h"
10
11 #define DRIVER_AUTHOR "Blake Arnold "
12 #define DRIVER_DESC "Branch Record module for modified kernel"
13
14 extern long (*start_rec)(void);
15 extern long (*stop_rec)(void);
16 extern long (*rec_owner)(void);
17
18
19 static int __init init_hello(void) /* registered via module_init: logs a message and installs the three record hooks */
20 {
21 //TODO: check cpuinfo
22 printk(KERN_INFO "Loading Branch recording module\n");
23 /*
24 * A non 0 return means init_module failed; module can't be loaded.
25 */
26 start_rec = &start_record; /* point the extern start_rec hook at start_record */
27 stop_rec = &stop_record; /* point the extern stop_rec hook at stop_record */
28 rec_owner = &record_owner; /* point the extern rec_owner hook at record_owner */
29 return 0; /* always reports success */
30 }
31
32 static void __exit cleanup_hello(void) /* registered via module_exit: logs a message and clears the three record hooks */
33 {
34 //TODO: stop performance counter
35 printk(KERN_INFO "Unloading branch recording module\n");
36 start_rec = NULL; /* reset each hook to NULL, undoing the assignments made in init_hello */
37 stop_rec = NULL;
38 rec_owner = NULL;
39 }
40
41
42
43 module_init(init_hello); /* init_hello is the load entry point */
44 module_exit(cleanup_hello); /* cleanup_hello is the unload entry point */
45
46 MODULE_LICENSE("GPL");
47 MODULE_AUTHOR(DRIVER_AUTHOR); /* Who wrote this module? */
48 MODULE_DESCRIPTION(DRIVER_DESC); /* What does this module do */
49
50 MODULE_SUPPORTED_DEVICE("testdevice"); /* NOTE(review): "testdevice" looks like a template placeholder — confirm it is intended */



1 #include /* Needed for KERN_INFO */
2 #include
3 #include
4 #include
5 #include
6 #include "br_msr.h"
7 #include "br_record.h"
8 #include "br_pmcaccess.h"
9
10
11 struct recording recordingList;
12 long start_record(void){
13 printk(KERN_INFO "starting performance counter\n");
14 if(probeCPUID() < tmp=" list_entry(pos," tmp =" kmalloc(sizeof(struct" reg =" IA32_PMC1;" reg =" IA32_PMC2;">list, &recordingList.list);
55 return 0;
56
57 }