commit 2bc5bd7379406ccb413b30769c0dc27a84f419b6
Author: Yingchi Long
Date:   Fri Feb 21 12:50:08 2025 +0800

    membound test init. L1 Cache Hit + Cache Miss testing

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5464b8d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+OBJS := loadop.o \
+	load.o
+
+CXXFLAGS := -O2 -fno-vectorize
+
+all: $(OBJS)
+	$(CXX) $(CXXFLAGS) -msimd -fuse-ld=lld $^ -o $@
+
+loadop.o: loadop.ll
+	$(CXX) $(CXXFLAGS) -msimd -c $^ -o $@
+
+load.o: load.cpp
+	$(CXX) $(CXXFLAGS) -msimd -c $^ -o $@
+
+.PHONY: clean
+clean:
+	rm -f $(OBJS) all
\ No newline at end of file
diff --git a/load.cpp b/load.cpp
new file mode 100644
index 0000000..368f990
--- /dev/null
+++ b/load.cpp
@@ -0,0 +1,157 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <vector>
+#include "loadop.h"
+
+#define ITERATIONS 1000000
+#define CACHE_LINE_SIZE 64         // typical cache-line size in bytes
+#define L1_CACHE_SIZE (32*1024)    // assumed L1 cache size: 32 KB
+#define HUGE_SIZE (256*1024*1024)  // large 256 MB buffer
+
+// The throughput loops are unrolled four times and index four entries per pass.
+static_assert(ITERATIONS % 4 == 0, "ITERATIONS must be a multiple of 4");
+
+// Times load_v8i32 over a buffer that is warmed up first (cache-hit scenario).
+//   base    - start of the buffer; each load reads 32 bytes at base + offset
+//   size    - span in bytes that the line-sized offsets wrap around in
+//   aligned - only selects the "Aligned"/"Unaligned" label in the output
+// Prints the average time per load for a one-load loop and a four-load loop.
+void test_cache_hit(uint8_t *base, size_t size, bool aligned) {
+    volatile uint32_t sink = 0;
+
+    // Warm the cache by touching every cache line once, in order.
+    for (size_t i = 0; i < size; i += CACHE_LINE_SIZE) {
+        sink += *(base + i);
+    }
+
+    // Latency: one load per iteration.
+    clock_t start = clock();
+    for (int i = 0; i < ITERATIONS; i++) {
+        size_t offset = (i * CACHE_LINE_SIZE) % size;  // advance one cache line per step
+        uint8_t *addr = base + offset;
+        load_v8i32(reinterpret_cast<void*>(addr));
+        sink += *addr;
+    }
+    clock_t end = clock();
+    double latency = (double)(end - start) * 1e9 / CLOCKS_PER_SEC / ITERATIONS;
+    printf("[Cache Hit] %s-Load Latency: %.2f ns\n",
+           aligned ? "Aligned" : "Unaligned", latency);
+
+    // Throughput: many back-to-back loads.
+    start = clock();
+    for (int i = 0; i < ITERATIONS; i += 4) {  // unrolled four times
+        size_t offset1 = (i * CACHE_LINE_SIZE) % size;
+        size_t offset2 = ((i+1) * CACHE_LINE_SIZE) % size;
+        size_t offset3 = ((i+2) * CACHE_LINE_SIZE) % size;
+        size_t offset4 = ((i+3) * CACHE_LINE_SIZE) % size;
+        load_v8i32(reinterpret_cast<void*>(base + offset1));
+        load_v8i32(reinterpret_cast<void*>(base + offset2));
+        load_v8i32(reinterpret_cast<void*>(base + offset3));
+        load_v8i32(reinterpret_cast<void*>(base + offset4));
+        sink += *(base + offset1) + *(base + offset2);
+    }
+    end = clock();
+    double throughput = (double)(end - start) * 1e9 /
+                        CLOCKS_PER_SEC / (ITERATIONS);
+    printf("[Cache Hit] %s-Load Throughput: %.2f ns/op\n\n",
+           aligned ? "Aligned" : "Unaligned", throughput);
+}
+
+// Times load_v8i32 at random cache-line offsets (cache-miss scenario).
+//   base    - start of the buffer; each load reads 32 bytes at base + offset
+//   size    - span in bytes from which the random line offsets are drawn
+//   aligned - only selects the "Aligned"/"Unaligned" label in the output
+// Prints the average time per load for a one-load loop and a four-load loop.
+void test_cache_miss(uint8_t *base, size_t size, bool aligned) {
+    volatile uint32_t sink = 0;
+    const size_t line_count = size / CACHE_LINE_SIZE;
+
+    // Random access pattern, to keep hardware prefetching from helping.
+    std::vector<size_t> offsets(ITERATIONS);
+    for (int i = 0; i < ITERATIONS; i++) {
+        offsets[i] = (rand() % line_count) * CACHE_LINE_SIZE;
+    }
+
+    // Latency: one load per iteration.
+    clock_t start = clock();
+    for (int i = 0; i < ITERATIONS; i++) {
+        uint8_t *addr = base + offsets[i];
+        load_v8i32(reinterpret_cast<void*>(addr));
+        sink += *addr;
+    }
+    clock_t endt = clock();
+    double latency = (double)(endt - start) * 1e9 / CLOCKS_PER_SEC / ITERATIONS;
+    printf("[Cache Miss] %s-Load Latency: %.2f ns\n",
+           aligned ? "Aligned" : "Unaligned", latency);
+
+    // Draw a fresh set of offsets before the clock starts, so that the
+    // rand() calls are not counted in the measured time.
+    for (int i = 0; i < ITERATIONS; i++) {
+        offsets[i] = (rand() % line_count) * CACHE_LINE_SIZE;
+    }
+
+    // Throughput: four independent loads per iteration.
+    start = clock();
+    for (int i = 0; i < ITERATIONS; i += 4) {
+        uint8_t *addr1 = base + offsets[i];
+        uint8_t *addr2 = base + offsets[i+1];
+        uint8_t *addr3 = base + offsets[i+2];
+        uint8_t *addr4 = base + offsets[i+3];
+        load_v8i32(reinterpret_cast<void*>(addr1));
+        load_v8i32(reinterpret_cast<void*>(addr2));
+        load_v8i32(reinterpret_cast<void*>(addr3));
+        load_v8i32(reinterpret_cast<void*>(addr4));
+        sink += *addr1 + *addr2;
+    }
+    endt = clock();
+    double throughput = (double)(endt - start) * 1e9 /
+                        CLOCKS_PER_SEC / (ITERATIONS);
+    printf("[Cache Miss] %s-Load Throughput: %.2f ns/op\n\n",
+           aligned ? "Aligned" : "Unaligned", throughput);
+}
+
+// Runs the four benchmark combinations: cache hit / cache miss, each with an
+// aligned and a deliberately misaligned base pointer.
+int main() {
+    // Small buffer, sized to the assumed L1 capacity.
+    uint8_t *small_aligned = (uint8_t*)aligned_alloc(CACHE_LINE_SIZE, L1_CACHE_SIZE);
+    // Large buffer, meant to produce a steady stream of cache misses.
+    uint8_t *large_aligned = (uint8_t*)aligned_alloc(CACHE_LINE_SIZE, HUGE_SIZE);
+    if (small_aligned == nullptr || large_aligned == nullptr) {
+        fprintf(stderr, "aligned_alloc failed\n");
+        free(small_aligned);
+        free(large_aligned);
+        return 1;
+    }
+
+    // Shift by one byte to get misaligned views of the same buffers.
+    uint8_t *small_unaligned = small_aligned + 1;
+    uint8_t *large_unaligned = large_aligned + 1;
+
+    // Initialise the data.
+    for (size_t i = 0; i < L1_CACHE_SIZE; i++) small_aligned[i] = i % 256;
+    for (size_t i = 0; i < HUGE_SIZE; i++) large_aligned[i] = i % 256;
+
+    // Cache hit, aligned access.
+    printf("-- L1 Cache Hit with Aligned Access --\n");
+    test_cache_hit(small_aligned, L1_CACHE_SIZE, true);
+
+    // Cache hit, misaligned access.
+    printf("-- L1 Cache Hit with Unaligned Access --\n");
+    test_cache_hit(small_unaligned, L1_CACHE_SIZE, false);
+
+    // Cache miss, aligned access.
+    printf("-- Cache Miss with Aligned Access --\n");
+    test_cache_miss(large_aligned, HUGE_SIZE, true);
+
+    // Cache miss, misaligned access.
+    printf("-- Cache Miss with Unaligned Access --\n");
+    test_cache_miss(large_unaligned, HUGE_SIZE, false);
+
+    free(small_aligned);
+    free(large_aligned);
+    return 0;
+}
diff --git a/loadop.h b/loadop.h
new file mode 100644
index 0000000..ffdd7b2
--- /dev/null
+++ b/loadop.h
@@ -0,0 +1,16 @@
+// Declarations for the vector-load wrappers defined in loadop.ll.
+#pragma once
+
+// 32-byte vector of long.
+typedef long longv4 __attribute__ ((vector_size(32)));
+
+extern "C" {
+
+// Defined in loadop.ll as a single `load <4 x i64>` from ptr.
+[[gnu::always_inline]] longv4 load_v4i64(void *ptr);
+
+// Defined in loadop.ll as a single `load <8 x i32>` from ptr; declared here
+// with the longv4 return type.
+[[gnu::always_inline]] longv4 load_v8i32(void *ptr);
+
+}
\ No newline at end of file
diff --git a/loadop.ll b/loadop.ll
new file mode 100644
index 0000000..452d146
--- /dev/null
+++ b/loadop.ll
@@ -0,0 +1,15 @@
+; Minimal wrappers around a single 256-bit vector load, declared in loadop.h.
+; NOTE(review): the loads have no explicit `align`, so they assume the ABI
+; alignment of the vector type; load.cpp also passes pointers offset by one
+; byte - confirm that this is the intended way to exercise misaligned access.
+target triple = "sw_64-sunway-linux-gnu"
+
+define <4 x i64> @load_v4i64(ptr %a) {
+  %x = load <4 x i64>, ptr %a
+  ret <4 x i64> %x
+}
+
+define <8 x i32> @load_v8i32(ptr %a) {
+  %x = load <8 x i32>, ptr %a
+  ret <8 x i32> %x
+}
\ No newline at end of file