Files
membound/load.cpp

131 lines
5.0 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <vector>
#include "loadop.h"
// Upper bound of the loop counters in every timed loop of this benchmark.
#define ITERATIONS 1000000
#define CACHE_LINE_SIZE 64 // typical cache-line size, in bytes
#define L1_CACHE_SIZE (32*1024) // assumes the L1 cache is 32 KB
#define HUGE_SIZE (256*1024*1024) // 256 MB large buffer
// Advances a byte offset by `step` inside a buffer of `size` bytes and wraps
// it back into [0, size). Callers must guarantee offset < size and
// step < size, so one conditional subtraction replaces a division.
static inline size_t next_wrapped_offset(size_t offset, size_t step, size_t size) {
    offset += step;
    return (offset >= size) ? offset - size : offset;
}

// Measures load performance when the working set is expected to stay cached.
//
// base    - first byte of the buffer to read
// size    - number of bytes of the buffer that the access pattern may cover
// aligned - only selects the "Aligned"/"Unaligned" label in the report
//
// The buffer is walked one cache line at a time, wrapping at `size`; the
// visited offsets are exactly (i * CACHE_LINE_SIZE) % size for i = 0, 1, ...
// Results are printed to stdout as nanoseconds per load.
// NOTE(review): load_v8i32 is not defined in this file (presumably it comes
// from loadop.h); its load width is assumed, not verified here.
void test_cache_hit(uint8_t *base, size_t size, bool aligned) {
    if (size == 0) {
        // The offset arithmetic below is modulo `size`; an empty buffer
        // cannot be measured.
        fprintf(stderr, "[Cache Hit] skipped: buffer size is 0\n");
        return;
    }

    const char *label = aligned ? "Aligned" : "Unaligned";
    // volatile so the compiler cannot discard the plain byte reads.
    volatile uint32_t sink = 0;

    // Warm the cache by touching every cache line once, in order.
    for (size_t i = 0; i < size; i += CACHE_LINE_SIZE) {
        sink += *(base + i);
    }

    // Per-load stride reduced modulo `size` once, outside the timed loops.
    // Keeping a running offset avoids a runtime division per load, which
    // would otherwise dominate the time of a cached load.
    const size_t step = CACHE_LINE_SIZE % size;

    // Latency: one load per iteration, hopping from cache line to cache line.
    size_t offset = 0;
    clock_t start = clock();
    for (int i = 0; i < ITERATIONS; i++) {
        uint8_t *addr = base + offset;
        load_v8i32(reinterpret_cast<uint64_t*>(addr));
        sink += *addr;
        offset = next_wrapped_offset(offset, step, size);
    }
    clock_t end = clock();
    double latency = (double)(end - start) * 1e9 / CLOCKS_PER_SEC / ITERATIONS;
    printf("[Cache Hit] %s-Load Latency: %.2f ns\n", label, latency);

    // Throughput: the same walk, unrolled four times so that four loads are
    // issued back to back per iteration.
    offset = 0;
    start = clock();
    for (int i = 0; i < ITERATIONS; i += 4) {
        const size_t offset1 = offset;
        const size_t offset2 = next_wrapped_offset(offset1, step, size);
        const size_t offset3 = next_wrapped_offset(offset2, step, size);
        const size_t offset4 = next_wrapped_offset(offset3, step, size);
        load_v8i32(reinterpret_cast<uint64_t*>(base + offset1));
        load_v8i32(reinterpret_cast<uint64_t*>(base + offset2));
        load_v8i32(reinterpret_cast<uint64_t*>(base + offset3));
        load_v8i32(reinterpret_cast<uint64_t*>(base + offset4));
        sink += *(base + offset1) + *(base + offset2);
        offset = next_wrapped_offset(offset4, step, size);
    }
    end = clock();
    double throughput = (double)(end - start) * 1e9 /
                        CLOCKS_PER_SEC / (ITERATIONS);
    printf("[Cache Hit] %s-Load Throughput: %.2f ns/op\n\n", label, throughput);
}
// Measures load performance when accesses are expected to miss the cache.
//
// base    - first byte of the buffer to read
// size    - number of bytes of the buffer that the access pattern may cover
// aligned - only selects the "Aligned"/"Unaligned" label in the report
//
// Every access targets base + k * CACHE_LINE_SIZE for a rand()-chosen line
// index k; the random pattern is meant to defeat hardware prefetching.
// All random offsets are generated outside the timed regions so that the
// reported numbers do not include the cost of rand().
// Results are printed to stdout as nanoseconds per load.
// NOTE(review): load_v8i32 is not defined in this file (presumably it comes
// from loadop.h); its load width is assumed, not verified here.
void test_cache_miss(uint8_t *base, size_t size, bool aligned) {
    const size_t line_count = size / CACHE_LINE_SIZE;
    if (line_count == 0) {
        // rand() % 0 would be undefined; there is no full line to pick.
        fprintf(stderr, "[Cache Miss] skipped: buffer is smaller than one cache line\n");
        return;
    }

    const char *label = aligned ? "Aligned" : "Unaligned";
    // volatile so the compiler cannot discard the plain byte reads.
    volatile uint32_t sink = 0;

    const size_t load_count = ITERATIONS;
    std::vector<size_t> offsets(load_count);
    size_t *const offs = offsets.data();

    // Random line-aligned offsets for the latency pass.
    for (size_t i = 0; i < load_count; i++) {
        offs[i] = (rand() % line_count) * CACHE_LINE_SIZE;
    }

    // Latency: one random load per iteration.
    clock_t start = clock();
    for (size_t i = 0; i < load_count; i++) {
        uint8_t *addr = base + offs[i];
        load_v8i32(reinterpret_cast<uint64_t*>(addr));
        sink += *addr;
    }
    clock_t endt = clock();
    double latency = (double)(endt - start) * 1e9 / CLOCKS_PER_SEC / ITERATIONS;
    printf("[Cache Miss] %s-Load Latency: %.2f ns\n", label, latency);

    // Fresh random offsets for the throughput pass, drawn before the clock
    // starts so rand() is not part of the measurement.
    for (size_t i = 0; i < load_count; i++) {
        offs[i] = (rand() % line_count) * CACHE_LINE_SIZE;
    }

    // Throughput: four independent random loads issued per iteration.
    // Only whole groups of four are run, and the average is taken over the
    // number of loads actually issued.
    const size_t unrolled_loads = load_count - (load_count % 4);
    start = clock();
    for (size_t i = 0; i < unrolled_loads; i += 4) {
        uint8_t *addr1 = base + offs[i];
        uint8_t *addr2 = base + offs[i + 1];
        uint8_t *addr3 = base + offs[i + 2];
        uint8_t *addr4 = base + offs[i + 3];
        load_v8i32(reinterpret_cast<uint64_t*>(addr1));
        load_v8i32(reinterpret_cast<uint64_t*>(addr2));
        load_v8i32(reinterpret_cast<uint64_t*>(addr3));
        load_v8i32(reinterpret_cast<uint64_t*>(addr4));
        sink += *addr1 + *addr2;
    }
    endt = clock();
    double throughput = (double)(endt - start) * 1e9 /
                        CLOCKS_PER_SEC / (double)unrolled_loads;
    printf("[Cache Miss] %s-Load Throughput: %.2f ns/op\n\n", label, throughput);
}
// Allocates a small and a large buffer, fills them, and runs the cache-hit
// and cache-miss benchmarks on an aligned and a one-byte-offset view of each.
// Returns 0 on success and 1 if either buffer cannot be allocated.
int main() {
    const size_t small_bytes = L1_CACHE_SIZE;
    const size_t large_bytes = HUGE_SIZE;
    // The unaligned views start one byte into the allocation but are handed
    // the same size as the aligned ones. One extra cache line of padding
    // keeps the whole range [base + 1, base + 1 + size) inside the allocation.
    // The padded size is still a multiple of the alignment.
    const size_t pad_bytes = CACHE_LINE_SIZE;

    // Small buffer, sized to L1_CACHE_SIZE so it is expected to stay in L1.
    uint8_t *small_aligned =
        static_cast<uint8_t*>(aligned_alloc(CACHE_LINE_SIZE, small_bytes + pad_bytes));
    // Large buffer, sized to HUGE_SIZE so random accesses keep missing.
    uint8_t *large_aligned =
        static_cast<uint8_t*>(aligned_alloc(CACHE_LINE_SIZE, large_bytes + pad_bytes));
    if (!small_aligned || !large_aligned) {
        fprintf(stderr, "aligned_alloc failed\n");
        free(small_aligned);  // free(NULL) is a no-op
        free(large_aligned);
        return 1;
    }

    // Deliberately shift by one byte to obtain misaligned base addresses.
    uint8_t *small_unaligned = small_aligned + 1;
    uint8_t *large_unaligned = large_aligned + 1;

    // Initialise every byte (padding included) so no load reads
    // indeterminate memory.
    for (size_t i = 0; i < small_bytes + pad_bytes; i++) {
        small_aligned[i] = static_cast<uint8_t>(i % 256);
    }
    for (size_t i = 0; i < large_bytes + pad_bytes; i++) {
        large_aligned[i] = static_cast<uint8_t>(i % 256);
    }

    // Cache hit, aligned access.
    printf("-- L1 Cache Hit with Aligned Access --\n");
    test_cache_hit(small_aligned, small_bytes, true);
    // Cache hit, unaligned access.
    printf("-- L1 Cache Hit with Unaligned Access --\n");
    test_cache_hit(small_unaligned, small_bytes, false);
    // Cache miss, aligned access.
    printf("-- Cache Miss with Aligned Access --\n");
    test_cache_miss(large_aligned, large_bytes, true);
    // Cache miss, unaligned access.
    printf("-- Cache Miss with Unaligned Access --\n");
    test_cache_miss(large_unaligned, large_bytes, false);

    free(small_aligned);
    free(large_aligned);
    return 0;
}