Mar 12, 2016

Using the RDTSC instruction, which returns the CPU's TSC (Time Stamp Counter)

/*
https://en.wikipedia.org/wiki/Time_Stamp_Counter
https://ru.wikipedia.org/wiki/Rdtsc
*/
#include <stdio.h>
/* Unsigned 64-bit type used to hold one complete counter reading. */
typedef unsigned long long uint64;

/*
 * Execute RDTSC eleven times in a row and print every reading as an
 * unsigned decimal number, one per line.
 */
int main() {
    unsigned int lo, hi;
    int remaining = 11;

    while (remaining-- > 0) {
        /* RDTSC delivers the low half in EAX ("=a") and the high half in EDX ("=d"). */
        __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));

        /* Stitch the two 32-bit halves into one 64-bit value. */
        uint64 tsc = (uint64)hi << 32;
        tsc |= lo;

        printf("%llu \n", tsc);
    }
}
view raw rdtsc.c hosted with ❤ by GitHub
// http://developers.redhat.com/blog/2016/03/11/practical-micro-benchmarking-with-ltrace-and-sched/
/* One drawback of the RDTSC instruction is that the CPU is allowed to reorder
it relative to other instructions, which causes noise in our results. Fortunately,
Intel has provided an RDTSCP instruction that’s more deterministic. We’ll pair
both with a CPUID instruction, which is a serializing instruction and therefore
acts as a barrier: CPUID + RDTSC to start a measurement and RDTSCP + CPUID to
end it, resulting in this: */
/*
 * rdtsc_s - take the "start" timestamp of a measurement.
 *
 * Executes CPUID followed by RDTSC and returns the 64-bit counter value
 * (EDX:EAX) as an int64_t. CPUID is used as a serializing instruction so
 * that work issued before the call cannot drift past the counter read.
 *
 * Both instructions live in ONE asm statement: the compiler is allowed to
 * schedule code between separate asm statements, even volatile ones.
 * EAX is loaded with 0 so CPUID always runs the same leaf instead of
 * whatever happened to be in the register. The "memory" clobber stops the
 * compiler from moving loads/stores across the measurement boundary.
 *
 * x86-64 only (uses GNU extended asm and 64-bit register names).
 */
static __inline__ int64_t rdtsc_s(void)
{
    unsigned int lo, hi;

    __asm__ __volatile__(
        "cpuid\n\t"
        "rdtsc"
        : "=a" (lo), "=d" (hi)
        : "a" (0)                       /* CPUID leaf 0 */
        : "rbx", "rcx", "memory");      /* CPUID also overwrites EBX/ECX */

    /* Widen with a fixed 64-bit type: shifting a 32-bit unsigned long by 32 is undefined. */
    return (int64_t)(((uint64_t)hi << 32) | (uint64_t)lo);
}
/*
 * rdtsc_e - take the "end" timestamp of a measurement.
 *
 * Executes RDTSCP, saves the counter value (EDX:EAX), then executes CPUID
 * and returns the saved 64-bit value as an int64_t. CPUID after the read
 * serializes execution so later code cannot start before the timestamp
 * is taken.
 *
 * RDTSCP writes ECX in addition to EAX/EDX, so ECX must be declared as
 * clobbered; otherwise the compiler may keep a live value there. The
 * counter halves are copied to compiler-chosen registers before CPUID
 * overwrites EAX..EDX, and EAX is zeroed so CPUID always runs leaf 0
 * rather than a leaf selected by the low timestamp bits.
 *
 * Everything is in ONE asm statement so the compiler cannot schedule code
 * between the instructions; "memory" keeps loads/stores from being moved
 * across the measurement boundary.
 *
 * x86-64 only (uses GNU extended asm and 64-bit register names).
 */
static __inline__ int64_t rdtsc_e(void)
{
    unsigned int lo, hi;

    __asm__ __volatile__(
        "rdtscp\n\t"
        "mov %%eax, %0\n\t"
        "mov %%edx, %1\n\t"
        "xor %%eax, %%eax\n\t"
        "cpuid"
        : "=r" (lo), "=r" (hi)
        :
        : "rax", "rbx", "rcx", "rdx", "cc", "memory");

    /* Widen with a fixed 64-bit type: shifting a 32-bit unsigned long by 32 is undefined. */
    return (int64_t)(((uint64_t)hi << 32) | (uint64_t)lo);
}
. . .
/* Usage excerpt: bracket the code under test with the start/end timestamp readers. */
/* Start reading, taken immediately before the code under test. */
clocks_before = rdtsc_s ();
p = malloc (i); /* Test goes here */
/* End reading, taken immediately after the code under test. */
clocks_after = rdtsc_e ();
/* Difference of the two readings, in counter ticks.
   NOTE(review): presumably also includes the overhead of the two timing calls - confirm. */
clocks_per_malloc = clocks_after - clocks_before;
// let the OS use CPU #0
// boot options:
// linux . . . isolcpus=1,2,3,4,5,6,7
// check:
// taskset -p $$
// Interrupt affinity:
// cd /proc/irq
// for i in */smp_affinity; do echo 1 > $i; done
view raw rdtscp.c hosted with ❤ by GitHub