caching - How to get L3 cache info (size, line length) on Intel processor using cpuid? -
I ran into trouble getting L3 cache info on Intel processors. Getting the L3 line length on AMD is simple, like this:
mov eax, 0x80000006 cpuid shl edx, 24 shr edx, 24
The same operation on Intel is more complicated. As I understand it, it might be done using this sequence:
mov eax, 2 cpuid
and parsing the register values according to the manual: http://www.microbe.cz/docs/cpuid.pdf (page 26, "Table 2-7. Descriptor Decode Values").
But my program does not find any of the enumerated descriptors, and returns 0 for both the cache size and the line length.
Is there a simpler and/or sufficient method to get the cache size and line length on Intel processors?
Here is the full code. The CPUID output (eax, ebx, ecx, edx) is pushed onto the stack, then each value is compared against a hardcoded list of descriptors. The comparison is made on the lower 8 bits, and then these bits are shifted out.
// Writes an L3-cache metric for the current CPU into the int referenced by
// the first argument.
//
// The function is declared naked, so it reads its argument straight from
// [esp+4] and issues its own `ret`. The value stored through the reference is:
//   * -1 when the EFLAGS bit-21 toggle test below reports no CPUID support;
//   * on the "intel" path: (sum of the sizes of all matched leaf-2 descriptors,
//     in KB) << 8, with the constant 64 in the low 8 bits as the line length;
//   * on the "amd" path: the low 8 bits of EDX returned by CPUID 0x80000006.
//
// NOTE(review): CPUID overwrites EBX and the code never saves/restores it;
// confirm the caller does not rely on EBX being preserved.
__declspec(dllexport) __declspec(naked) void getmetriclevel2(int &length) {
    __asm {
        // check cpuid availability: flip bit 21 (00200000h) in a copy of
        // EFLAGS, write it back, and read EFLAGS again
        pushfd
        pop eax
        mov ebx, eax
        xor eax, 00200000h
        push eax
        popfd
        pushfd
        pop eax
        cmp eax, ebx
        jnz has_cpuid              // flags changed -> CPUID is usable
        mov edx, -1 // return -1 by reference
        jmp ret_arg
    has_cpuid:
        mov eax, 2 // l3 intel, incomplete
        mov ecx, 0
        cpuid
        // OR all four result registers together (ECX is saved and restored
        // around the test) to see whether leaf 2 returned anything at all
        push ecx
        or ecx, eax
        or ecx, ebx
        or ecx, edx
        cmp ecx, 0
        pop ecx // experimental
        je cpu_amd // if all registers are 0, try the amd scheme
    cpu_intel:
        push ebp                   // EBP is used as the size accumulator
        mov ebp, 0
        push 0                     // pass counter (incremented in cycle_begin)
        push eax // store counter (EAX from the first CPUID; AL is tested in call_finish)
        jmp call_begin
    cycle_begin:
        // additional pass: bump the pass counter, re-save the remaining
        // count, and execute CPUID leaf 2 again
        pop ecx
        inc ecx
        push ecx
        push eax
        mov eax, 2
        cpuid
    call_begin:
        // push the four result registers; they are popped one at a time
        // (EDX first) by parse_reg
        push eax
        push ebx
        push ecx
        push edx
        mov ch, 4                  // CH = registers left to parse
    parse_reg:
        pop edx
        mov cl, 4                  // CL = descriptor bytes left in this register
    parse_descr:
        // Compare the low byte (DL) against each known L3 descriptor and add
        // the matching size to EBP. Label comments give "size, ways"; sizes
        // are in KB (e.g. 12288 for the 12 MB entry).
    dd0h: // 512, 4-way
        cmp dl, 0xd0
        jne dd1h
        add ebp, 512d
        jmp miss_l3cache
    dd1h: // 1024, 4-way
        cmp dl, 0xd1
        jne dd2h
        add ebp, 1024d
        jmp miss_l3cache
    dd2h: // 2048, 4-way
        cmp dl, 0xd2
        jne dd6h
        add ebp, 2048d
        jmp miss_l3cache
    dd6h: // 1024, 8-way
        cmp dl, 0xd6
        jne dd7h
        add ebp, 1024d
        jmp miss_l3cache
    dd7h: // 2048, 8-way
        cmp dl, 0xd7
        jne dd8h
        add ebp, 2048d
        jmp miss_l3cache
    dd8h: // 4096, 8-way
        cmp dl, 0xd8
        jne ddch
        add ebp, 4096d
        jmp miss_l3cache
    ddch: // 1536, 12-way
        cmp dl, 0xdc
        jne dddh
        add ebp, 1536d
        jmp miss_l3cache
    dddh: // 3072, 12-way
        cmp dl, 0xdd
        jne ddeh
        add ebp, 3072d
        jmp miss_l3cache
    ddeh: // 6144, 12-way
        cmp dl, 0xde
        jne de2h
        add ebp, 6144d
        jmp miss_l3cache
    de2h: // 2048, 16-way
        cmp dl, 0xe2
        jne de3h
        add ebp, 2048d
        jmp miss_l3cache
    de3h: // 4096, 16-way
        cmp dl, 0xe3
        jne de4h
        add ebp, 4096d
        jmp miss_l3cache
    de4h: // 8192, 16-way
        cmp dl, 0xe4
        jne deah
        add ebp, 8192d
        jmp miss_l3cache
    deah: // 12 MB, 24-way
        cmp dl, 0xea
        jne debh
        add ebp, 12288d
        jmp miss_l3cache
    debh: // 18 MB, 24-way
        cmp dl, 0xeb
        jne dech
        add ebp, 18432d
        jmp miss_l3cache
    dech: // 24 MB, 24-way
        cmp dl, 0xec
        jne miss_l3cache
        add ebp, 24576d
    miss_l3cache:
        // NOTE(review): the `shr` sits between `cmp cl, 0` and `jne`, so the
        // branch tests the flags of the shift (EDX == 0), not of the CL
        // counter. The loop still ends after at most four shifts because EDX
        // is then zero.
        dec cl
        cmp cl, 0
        shr edx, 8 // it's an 8-bit descriptor: move the next byte into DL
        jne parse_descr
        dec ch
        cmp ch, 0
        jne parse_reg
    call_finish:
        // NOTE(review): when the saved AL is 1 this still branches back for a
        // second CPUID/parse pass before finishing -- confirm that is intended.
        pop eax
        cmp al, 0
        je cycle_finish // replace je
        dec al
        jmp cycle_begin
    cycle_finish:
        mov edx, ebp
        shl edx, 8 // low 8 bits are reserved for the cache line length
        mov
        dl, 64d // intel has a 64 byte l3 line
        add esp, 4                 // drop the pass counter
        pop ebp
        jmp ret_arg
    cpu_amd:
        mov eax, 0x80000006 // l3 amd
        cpuid
        // keep only the low 8 bits of EDX
        shl edx, 24
        shr edx, 24
    ret_arg:
        mov eax, [esp+4] // first argument lies here
        mov [eax], edx // return by reference
        ret
    }
}
There are a number of problems with your code. You should use the __cpuid
compiler intrinsic and write this in C++. It'll make your code easier to write and maintain.
There are two major problems with your code. First, you're not using CPUID function 2 correctly: the value in ECX is ignored when you use this function. Second, you're not using CPUID function 4 to determine the cache size when function 2 returns the 0FFh
descriptor.
Other problems with your code include:
- Not ignoring the invalid register values returned by function 2 that have the high bit set.
- Not handling a number of cache descriptors that describe L3 caches.
- Your inner loop's byte counter isn't actually used, since
shr edx, 8
sets the flags. The loop works anyway, because when EDX becomes 0 it doesn't contain any more possible L3 descriptors.
Part of the problem is that you're using an outdated manual. You should use the latest Intel Software Developer's Manual.
It's not tested, and it may have transcription errors in the cache descriptor switch statement, but here's a C implementation that uses CPUID functions 2 and 4 to determine the size, associativity, and cache line size of the L3 cache:
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
#elif defined(__i386__) || defined(__x86_64__)
#include <cpuid.h>
#endif

enum {
    L3_LINE_SIZE = 64,      /* every leaf 2 L3 descriptor uses 64-byte lines */
    L3_MAX_SUBLEAVES = 64   /* safety bound on the leaf 4 enumeration */
};

/* One CPUID leaf 2 descriptor byte that unconditionally describes an L3 cache. */
struct l3_descriptor {
    unsigned char id;       /* descriptor byte */
    unsigned char ways;     /* associativity */
    unsigned size_kb;       /* cache size in KB */
};

static const struct l3_descriptor l3_descriptors[] = {
    { 0x22,  4,   512 }, { 0x23,  8,  1024 }, { 0x25,  8,  2048 },
    { 0x29,  8,  4096 }, { 0x46,  4,  4096 }, { 0x47,  8,  8192 },
    { 0x4a, 12,  6144 }, { 0x4b, 16,  8192 }, { 0x4c, 12, 12288 },
    { 0x4d, 16, 16384 }, { 0xd0,  4,   512 }, { 0xd1,  4,  1024 },
    { 0xd2,  4,  2048 }, { 0xd6,  8,  1024 }, { 0xd7,  8,  2048 },
    { 0xd8,  8,  4096 }, { 0xdc, 12,  1536 }, { 0xdd, 12,  3072 },
    { 0xde, 12,  6144 }, { 0xe2, 16,  2048 }, { 0xe3, 16,  4096 },
    { 0xe4, 16,  8192 }, { 0xea, 24, 12288 }, { 0xeb, 24, 18432 },
    { 0xec, 24, 24576 }
};

/*
 * Execute CPUID for (leaf, subleaf) and store EAX, EBX, ECX, EDX in regs[0..3].
 * Returns 0 on success, -1 when the target has no CPUID instruction available
 * to this build (non-x86 target or unknown compiler).
 */
static int l3_cpuid(unsigned regs[4], unsigned leaf, unsigned subleaf)
{
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
    int raw[4];
    __cpuidex(raw, (int)leaf, (int)subleaf);
    for (int i = 0; i < 4; i++) {
        regs[i] = (unsigned)raw[i];
    }
    return 0;
#elif defined(__i386__) || defined(__x86_64__)
    __cpuid_count(leaf, subleaf, regs[0], regs[1], regs[2], regs[3]);
    return 0;
#else
    (void)leaf;
    (void)subleaf;
    regs[0] = regs[1] = regs[2] = regs[3] = 0;
    return -1;
#endif
}

/* Store one L3 description in the caller's three output variables. */
static void l3_store(unsigned *size, unsigned *assoc, unsigned *linesize,
                     unsigned s, unsigned a, unsigned l)
{
    *size = s;
    *assoc = a;
    *linesize = l;
}

/* Find a leaf 2 descriptor byte in the L3 table; returns NULL-equivalent 0 if absent. */
static const struct l3_descriptor *l3_find_descriptor(unsigned char id)
{
    for (unsigned i = 0; i < sizeof l3_descriptors / sizeof l3_descriptors[0]; i++) {
        if (l3_descriptors[i].id == id) {
            return &l3_descriptors[i];
        }
    }
    return 0;
}

/**
 * Determine size, associativity and line size of the L3 cache on an Intel
 * processor using CPUID leaf 2 (descriptor bytes) and, when leaf 2 reports
 * descriptor 0xFF, CPUID leaf 4 (deterministic cache parameters).
 *
 * @param size     receives the L3 size in bytes (0 if there is no L3 cache);
 *                 saturates at the maximum unsigned value
 * @param assoc    receives the number of ways (0 if there is no L3 cache)
 * @param linesize receives the line size in bytes (0 if there is no L3 cache)
 * @return 0 when the outputs were written, -1 when no L3 information could be
 *         found (outputs untouched), including: a NULL argument, no CPUID
 *         support in this build, or a CPU that does not expose leaf 2 / leaf 4.
 */
int get_intel_l3_info(unsigned *size, unsigned *assoc, unsigned *linesize)
{
    unsigned regs[4];

    if (!size || !assoc || !linesize) {
        return -1;
    }

    if (l3_cpuid(regs, 0, 0) != 0) {    /* leaf 0: maximum input value */
        return -1;
    }
    unsigned max_leaf = regs[0];
    if (max_leaf < 2) {
        return -1;                      /* no way to find L3 cache info */
    }

    /*
     * Leaf 1: processor signature. The extended family/model fields are
     * folded in so that only a true family 0Fh, model 06h part matches the
     * 0x49 special case below.
     */
    l3_cpuid(regs, 1, 0);
    unsigned base_family = (regs[0] >> 8) & 0xf;
    unsigned family = base_family;
    unsigned model = (regs[0] >> 4) & 0xf;
    if (base_family == 0xf) {
        family += (regs[0] >> 20) & 0xff;
    }
    if (base_family == 0x6 || base_family == 0xf) {
        model += ((regs[0] >> 16) & 0xf) << 4;
    }

    /*
     * Leaf 2: cache and TLB descriptor bytes. The loop visits the 16 bytes of
     * EAX..EDX in order and skips the low byte of EAX, which is not a
     * descriptor.
     */
    l3_cpuid(regs, 2, 0);
    int use_leaf_4 = 0;
    for (int r = 0; r < 4; r++) {
        unsigned word = regs[r];
        if (word & 0x80000000u) {
            continue;                   /* register invalid if its top bit is set */
        }
        for (int b = (r == 0) ? 1 : 0; b < 4; b++) {
            unsigned char id = (unsigned char)((word >> (8 * b)) & 0xff);

            if (id == 0x40) {           /* no L3 cache */
                l3_store(size, assoc, linesize, 0, 0, 0);
                return 0;
            }
            if (id == 0xff) {           /* leaf 2 has no cache info; use leaf 4 */
                use_leaf_4 = 1;
                continue;
            }
            if (id == 0x49) {
                /* 0x49 is an L3 descriptor only on family 0Fh, model 06h. */
                if (family == 0x0f && model == 0x06) {
                    l3_store(size, assoc, linesize, 4096u * 1024u, 16, L3_LINE_SIZE);
                    return 0;
                }
                continue;
            }

            const struct l3_descriptor *d = l3_find_descriptor(id);
            if (d) {
                l3_store(size, assoc, linesize, d->size_kb * 1024u, d->ways,
                         L3_LINE_SIZE);
                return 0;
            }
        }
    }

    if (!use_leaf_4 || max_leaf < 4) {
        return -1;                      /* failed, no L3 info found */
    }

    /* Leaf 4: walk the cache subleaves until the level-3 entry or the end. */
    for (unsigned sub = 0; sub < L3_MAX_SUBLEAVES; sub++) {
        l3_cpuid(regs, 4, sub);

        if ((regs[0] & 0x1f) == 0) {    /* cache type 0: no more caches, no L3 */
            l3_store(size, assoc, linesize, 0, 0, 0);
            return 0;
        }
        if (((regs[0] >> 5) & 0x7) == 3) {
            unsigned lsize = (regs[1] & 0xfff) + 1;
            unsigned partitions = ((regs[1] >> 12) & 0x3ff) + 1;
            unsigned ways = ((regs[1] >> 22) & 0x3ff) + 1;
            unsigned long long sets = (unsigned long long)regs[2] + 1;

            /* Multiply in 64 bits and saturate instead of overflowing. */
            unsigned long long bytes = sets * ways * partitions * lsize;
            if (bytes > (unsigned)-1) {
                bytes = (unsigned)-1;
            }
            l3_store(size, assoc, linesize, (unsigned)bytes, ways, lsize);
            return 0;
        }
    }

    return -1;                          /* enumeration never terminated */
}
Comments
Post a Comment