membound test init. L1 Cache Hit + Cache Miss testing
This commit is contained in:
17
Makefile
Normal file
17
Makefile
Normal file
@@ -0,0 +1,17 @@
|
||||
# Object files that make up the benchmark.
OBJS := loadop.o \
        load.o

CXXFLAGS := -O2 -fno-vectorize

# Link step. The output name is $@, so the resulting executable is a file
# literally called "all" (the clean rule below removes it under that name).
all: $(OBJS)
	$(CXX) $(CXXFLAGS) -msimd -fuse-ld=lld $^ -o $@

# loadop.o is built from the loadop.ll source.
loadop.o: loadop.ll
	$(CXX) $(CXXFLAGS) -msimd -c $< -o $@

# load.cpp includes loadop.h, so the object must be rebuilt when the header
# changes. $< (first prerequisite only) keeps the header off the compile line.
load.o: load.cpp loadop.h
	$(CXX) $(CXXFLAGS) -msimd -c $< -o $@

.PHONY: clean
clean:
	rm -f $(OBJS) all
|
||||
130
load.cpp
Normal file
130
load.cpp
Normal file
@@ -0,0 +1,130 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include "loadop.h"
|
||||
|
||||
#define ITERATIONS 1000000 // number of loads performed in each timed pass
|
||||
#define CACHE_LINE_SIZE 64 // typical cache line size, in bytes
|
||||
#define L1_CACHE_SIZE (32*1024) // assumes the L1 cache is 32 KB
|
||||
#define HUGE_SIZE (256*1024*1024) // 256 MB large buffer
|
||||
|
||||
// 测试缓存命中场景下的加载性能
|
||||
void test_cache_hit(uint8_t *base, size_t size, bool aligned) {
|
||||
volatile uint32_t sink = 0;
|
||||
|
||||
// 通过顺序访问所有缓存行来预热缓存
|
||||
for (size_t i = 0; i < size; i += CACHE_LINE_SIZE) {
|
||||
sink += *(base + i);
|
||||
}
|
||||
|
||||
// 测量延迟
|
||||
clock_t start = clock();
|
||||
for (int i = 0; i < ITERATIONS; i++) {
|
||||
size_t offset = (i * CACHE_LINE_SIZE) % size; // 跳跃缓存行
|
||||
uint8_t *addr = base + offset;
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(addr));
|
||||
sink += *addr;
|
||||
}
|
||||
clock_t end = clock();
|
||||
double latency = (double)(end - start) * 1e9 / CLOCKS_PER_SEC / ITERATIONS;
|
||||
printf("[Cache Hit] %s-Load Latency: %.2f ns\n",
|
||||
aligned ? "Aligned" : "Unaligned", latency);
|
||||
|
||||
// 测量大量连续访问时的吞吐量
|
||||
start = clock();
|
||||
for (int i = 0; i < ITERATIONS; i += 4) { // 循环展开四次
|
||||
size_t offset1 = (i * CACHE_LINE_SIZE) % size;
|
||||
size_t offset2 = ((i+1) * CACHE_LINE_SIZE) % size;
|
||||
size_t offset3 = ((i+2) * CACHE_LINE_SIZE) % size;
|
||||
size_t offset4 = ((i+3) * CACHE_LINE_SIZE) % size;
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(base + offset1));
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(base + offset2));
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(base + offset3));
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(base + offset4));
|
||||
sink += *(base + offset1) + *(base + offset2);
|
||||
}
|
||||
end = clock();
|
||||
double throughput = (double)(end - start) * 1e9 /
|
||||
CLOCKS_PER_SEC / (ITERATIONS);
|
||||
printf("[Cache Hit] %s-Load Throughput: %.2f ns/op\n\n",
|
||||
aligned ? "Aligned" : "Unaligned", throughput);
|
||||
}
|
||||
|
||||
// 测试缓存未命中场景下的加载性能
|
||||
void test_cache_miss(uint8_t *base, size_t size, bool aligned) {
|
||||
volatile uint32_t sink = 0;
|
||||
|
||||
// 使用随机访问模式避免硬件预取优化
|
||||
size_t *offsets = new size_t[ITERATIONS];
|
||||
for (int i = 0; i < ITERATIONS; i++) {
|
||||
offsets[i] = (rand() % (size / CACHE_LINE_SIZE)) * CACHE_LINE_SIZE;
|
||||
}
|
||||
|
||||
// 测量延迟
|
||||
clock_t start = clock();
|
||||
for (int i = 0; i < ITERATIONS; i++) {
|
||||
uint8_t *addr = base + offsets[i];
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(addr));
|
||||
sink += *addr;
|
||||
}
|
||||
clock_t endt = clock();
|
||||
double latency = (double)(endt - start) * 1e9 / CLOCKS_PER_SEC / ITERATIONS;
|
||||
printf("[Cache Miss] %s-Load Latency: %.2f ns\n",
|
||||
aligned ? "Aligned" : "Unaligned", latency);
|
||||
|
||||
// 测量多路并发加载时的吞吐量
|
||||
start = clock();
|
||||
for (int i = 0; i < ITERATIONS; i += 4) {
|
||||
uint8_t *addr1 = base + (rand() % (size / CACHE_LINE_SIZE)) * CACHE_LINE_SIZE;
|
||||
uint8_t *addr2 = base + (rand() % (size / CACHE_LINE_SIZE)) * CACHE_LINE_SIZE;
|
||||
uint8_t *addr3 = base + (rand() % (size / CACHE_LINE_SIZE)) * CACHE_LINE_SIZE;
|
||||
uint8_t *addr4 = base + (rand() % (size / CACHE_LINE_SIZE)) * CACHE_LINE_SIZE;
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(addr1));
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(addr2));
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(addr3));
|
||||
load_v8i32(reinterpret_cast<uint64_t*>(addr4));
|
||||
sink += *addr1 + *addr2;
|
||||
}
|
||||
endt = clock();
|
||||
delete[] offsets;
|
||||
double throughput = (double)(endt - start) * 1e9 /
|
||||
CLOCKS_PER_SEC / (ITERATIONS);
|
||||
printf("[Cache Miss] %s-Load Throughput: %.2f ns/op\n\n",
|
||||
aligned ? "Aligned" : "Unaligned", throughput);
|
||||
}
|
||||
|
||||
int main() {
|
||||
// 分配小内存(确保全在L1缓存)
|
||||
uint8_t *small_aligned = (uint8_t*)aligned_alloc(CACHE_LINE_SIZE, L1_CACHE_SIZE);
|
||||
uint8_t *small_unaligned = small_aligned + 1; // 故意偏移1字节造非对齐
|
||||
|
||||
// 分配大内存(触发持续Cache Miss)
|
||||
uint8_t *large_aligned = (uint8_t*)aligned_alloc(CACHE_LINE_SIZE, HUGE_SIZE);
|
||||
uint8_t *large_unaligned = large_aligned + 1;
|
||||
|
||||
// 初始化数据
|
||||
for (size_t i = 0; i < L1_CACHE_SIZE; i++) small_aligned[i] = i % 256;
|
||||
for (size_t i = 0; i < HUGE_SIZE; i++) large_aligned[i] = i % 256;
|
||||
|
||||
// 测试缓存命中对齐访问
|
||||
printf("-- L1 Cache Hit with Aligned Access --\n");
|
||||
test_cache_hit(small_aligned, L1_CACHE_SIZE, true);
|
||||
|
||||
// 测试缓存命中非对齐访问
|
||||
printf("-- L1 Cache Hit with Unaligned Access --\n");
|
||||
test_cache_hit(small_unaligned, L1_CACHE_SIZE, false);
|
||||
|
||||
// 测试缓存未命中对齐访问
|
||||
printf("-- Cache Miss with Aligned Access --\n");
|
||||
test_cache_miss(large_aligned, HUGE_SIZE, true);
|
||||
|
||||
// 测试缓存未命中非对齐访问
|
||||
printf("-- Cache Miss with Unaligned Access --\n");
|
||||
test_cache_miss(large_unaligned, HUGE_SIZE, false);
|
||||
|
||||
free(small_aligned);
|
||||
free(large_aligned);
|
||||
return 0;
|
||||
}
|
||||
10
loadop.h
Normal file
10
loadop.h
Normal file
@@ -0,0 +1,10 @@
|
||||
#ifndef LOADOP_H
#define LOADOP_H

// 32-byte vector of `long` elements (four lanes where `long` is 8 bytes).
typedef long longv4 __attribute__ ((vector_size(32)));

// Load primitives with C linkage. Their definitions are not in this header;
// the Makefile builds loadop.o from loadop.ll.
extern "C" {

// Takes a pointer and returns a longv4.
// NOTE(review): presumably loads four 64-bit lanes from ptr; confirm in loadop.ll.
[[gnu::always_inline]] longv4 load_v4i64(void *ptr);

// Takes a pointer and returns a longv4.
// NOTE(review): presumably loads eight 32-bit lanes from ptr; confirm in loadop.ll.
[[gnu::always_inline]] longv4 load_v8i32(void *ptr);

}

#endif // LOADOP_H
|
||||
Reference in New Issue
Block a user