This commit is contained in:
2025-03-17 20:57:27 +08:00
commit ea311465a1
3 changed files with 107 additions and 0 deletions

25
Makefile Normal file
View File

@@ -0,0 +1,25 @@
# Default goal: build the three test binaries, then run each one in order.
# Each binary is its own recipe line, so a non-zero exit from one stops the
# run and fails `make` (unless make is invoked with -k or -i).
#
# `all` names an action, not a file. Declaring it phony keeps a stray file
# called `all` from making this rule look up to date.
.PHONY: all
all: unittest_scalar unittest_vector unittest_vector_unalign
	./unittest_scalar
	./unittest_vector
	./unittest_vector_unalign
# Link each test binary from the shared C++ driver plus one build of the
# kernel:  unittest_<variant>  <-  unittest.o  x264_<variant>.o
# A static pattern rule replaces three identical copy-pasted link rules.
#
# Libraries are listed after the object files: linkers resolve symbols left
# to right, so a library named before the objects that need it can be
# dropped (static archives, --as-needed). $(LDLIBS) is the conventional hook
# for extra libraries supplied on the command line.
unittest_scalar unittest_vector unittest_vector_unalign: unittest_%: unittest.o x264_%.o
	$(CXX) $(LDFLAGS) $^ -lstdc++ $(LDLIBS) -o $@
# Object files.
#
# The same IR source is compiled three times; only the flags placed in front
# of $(CXXFLAGS) differ between the variants.
kernel_ir := x264_sum.ll
# NOTE(review): presumably selects SIMD code generation for the sw8a CPU -
# confirm against the toolchain documentation.
simd_flags := -msimd -mcpu=sw8a
# Forwards one option to the LLVM backend via -mllvm; by its name it permits
# misaligned memory accesses on sw64 - confirm.
misalign_flags := -mllvm -sw64-allows-misaligned-memory-accesses

# C++ test driver.
unittest.o: unittest.cpp
	$(CXX) $(CXXFLAGS) -c $< -o $@

# Kernel built with only the user-supplied flags.
x264_scalar.o: $(kernel_ir)
	$(CXX) $(CXXFLAGS) -c $< -o $@

# Kernel built with the SIMD flags.
x264_vector.o: $(kernel_ir)
	$(CXX) $(simd_flags) $(CXXFLAGS) -c $< -o $@

# Kernel built with the SIMD flags plus the misaligned-access backend option.
x264_vector_unalign.o: $(kernel_ir)
	$(CXX) $(simd_flags) $(misalign_flags) $(CXXFLAGS) -c $< -o $@

50
unittest.cpp Normal file
View File

@@ -0,0 +1,50 @@
#include <iostream>
#include <cassert>
#include <chrono>
#include <cstdint> // use the standard uint8_t from <cstdint> instead of a user-defined one
#include <limits>
// Externally implemented SAD function: declaration only, the body is
// provided by a separately compiled object file.
extern "C" {
int x264_pixel_sad_16x16(uint8_t *pix1, int i_stride_pix1,
                         uint8_t *pix2, int i_stride_pix2);
}

// Unit test for x264_pixel_sad_16x16.
//
// Checks the kernel against hand-computed values and against a plain scalar
// reference that sums |pix1 - pix2| over 16 rows of 16 bytes, where each
// buffer advances by its own stride between rows.
//
// Failures are reported through assert(), so the checks are compiled out
// when NDEBUG is defined.
void test_sad() {
    const int kSide = 16;        // block is kSide x kSide pixels
    const int kWideStride = 24;  // row pitch larger than the block width

    uint8_t wide1[kSide * kWideStride];
    uint8_t wide2[kSide * kWideStride];
    uint8_t tight[kSide * kSide];

    // Scalar reference implementation used to derive expected values.
    auto reference = [](const uint8_t *p1, int s1, const uint8_t *p2, int s2) {
        int sum = 0;
        for (int y = 0; y < 16; ++y) {
            for (int x = 0; x < 16; ++x) {
                const int d = static_cast<int>(p1[y * s1 + x]) -
                              static_cast<int>(p2[y * s2 + x]);
                sum += d < 0 ? -d : d;
            }
        }
        return sum;
    };

    // Case 1: identical blocks give a SAD of zero.
    for (int i = 0; i < kSide * kWideStride; ++i) {
        wide1[i] = static_cast<uint8_t>((i * 7 + 3) & 0xFF);
        wide2[i] = wide1[i];
    }
    assert(x264_pixel_sad_16x16(wide1, kWideStride, wide2, kWideStride) == 0);

    // Case 2: one differing pixel in the last row/column of the block, plus a
    // differing padding byte (column 16 of row 0) that must NOT be counted.
    wide1[15 * kWideStride + 15] = 200;
    wide2[15 * kWideStride + 15] = 50;
    wide1[16] = 0;
    wide2[16] = 255;
    assert(x264_pixel_sad_16x16(wide1, kWideStride, wide2, kWideStride) == 150);

    // Case 3: maximum difference on every pixel, in both argument orders.
    for (int i = 0; i < kSide * kWideStride; ++i) {
        wide1[i] = 255;
        wide2[i] = 0;
    }
    assert(x264_pixel_sad_16x16(wide1, kWideStride, wide2, kWideStride) == 16 * 16 * 255);
    assert(x264_pixel_sad_16x16(wide2, kWideStride, wide1, kWideStride) == 16 * 16 * 255);

    // Case 4: arbitrary data with two different strides, checked against the
    // scalar reference.
    for (int i = 0; i < kSide * kWideStride; ++i) {
        wide1[i] = static_cast<uint8_t>((i * 37 + 11) & 0xFF);
    }
    for (int i = 0; i < kSide * kSide; ++i) {
        tight[i] = static_cast<uint8_t>((i * 53 + 7) & 0xFF);
    }
    assert(x264_pixel_sad_16x16(wide1, kWideStride, tight, kSide) ==
           reference(wide1, kWideStride, tight, kSide));
    assert(x264_pixel_sad_16x16(tight, kSide, wide1, kWideStride) ==
           reference(tight, kSide, wide1, kWideStride));
}
// Benchmark helper: calls x264_pixel_sad_16x16 `iterations` times on two
// zero-filled 16x16 blocks and prints the total and the per-call time.
//
//   iterations  number of timed calls (default 1000). Values <= 0 are
//               rejected with a message, since an average over zero calls
//               is undefined.
//
// Timing uses steady_clock, which is monotonic; high_resolution_clock may be
// an alias for the adjustable system clock. The average is computed from the
// untruncated elapsed time so that sub-microsecond totals are not lost.
void measure_time(int iterations = 1000) {
    if (iterations <= 0) {
        std::cout << "measure_time: iterations must be positive, got "
                  << iterations << std::endl;
        return;
    }

    // Test data (may be filled with arbitrary values).
    uint8_t pix1[16 * 16] = {0};
    uint8_t pix2[16 * 16] = {0};

    // Accumulate the results so the calls have an observable effect.
    int result_sum = 0;

    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < iterations; ++i) {
        result_sum += x264_pixel_sad_16x16(pix1, 16, pix2, 16);
    }
    const auto end = std::chrono::steady_clock::now();
    const auto elapsed = end - start;

    // Total as whole microseconds; average as fractional microseconds.
    const auto total_duration =
        std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
    const double avg_duration =
        std::chrono::duration<double, std::micro>(elapsed).count() / iterations;

    std::cout << "Total Execution Time for " << iterations << " iterations: "
              << total_duration << " μs" << std::endl;
    std::cout << "Average Execution Time: " << avg_duration << " μs" << std::endl;
    // Printing the sum shows the results were actually used.
    std::cout << "Total Result Sum: " << result_sum << std::endl;
}
// Entry point: runs the unit test first, then the timing benchmark.
int main() {
    // Correctness check.
    test_sad();
    std::cout << "All tests passed!" << std::endl;

    // Benchmark with the default iteration count.
    measure_time();

    return 0;
}

32
x264_sum.ll Normal file
View File

@@ -0,0 +1,32 @@
; Sum of absolute differences over 16 rows of 16 bytes.
;
; Each loop iteration loads one 16-byte row from each input, widens the bytes
; to i16, takes the element-wise absolute difference, widens to i32 and adds
; the horizontal sum to the running total. After each row the two pointers
; advance by their own (sign-extended) stride. The loop runs exactly 16 times.
define dso_local noundef i32 @x264_pixel_sad_16x16(ptr %pix1, i32 noundef %i_stride_pix1, ptr %pix2, i32 noundef %i_stride_pix2) {
entry:
  ; Strides are used as byte offsets for pointer arithmetic.
  %stride1 = sext i32 %i_stride_pix1 to i64
  %stride2 = sext i32 %i_stride_pix2 to i64
  br label %row

row:
  %y = phi i32 [ 0, %entry ], [ %y.next, %row ]          ; rows completed so far
  %sad = phi i32 [ 0, %entry ], [ %sad.next, %row ]      ; running total
  %p1 = phi ptr [ %pix1, %entry ], [ %p1.next, %row ]    ; current row of pix1
  %p2 = phi ptr [ %pix2, %entry ], [ %p2.next, %row ]    ; current row of pix2
  ; Loads are byte-aligned (align 1): no alignment is assumed for the rows.
  %a.u8 = load <16 x i8>, ptr %p1, align 1
  %a = zext <16 x i8> %a.u8 to <16 x i16>
  %b.u8 = load <16 x i8>, ptr %p2, align 1
  %b = zext <16 x i8> %b.u8 to <16 x i16>
  ; Operands are zero-extended bytes, so the difference fits in i16.
  %diff = sub nsw <16 x i16> %a, %b
  %absdiff = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %diff, i1 false)
  %absdiff.w = zext <16 x i16> %absdiff to <16 x i32>
  %row.sad = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absdiff.w)
  %sad.next = add i32 %row.sad, %sad
  %p1.next = getelementptr inbounds i8, ptr %p1, i64 %stride1
  %p2.next = getelementptr inbounds i8, ptr %p2, i64 %stride2
  %y.next = add nuw nsw i32 %y, 1
  %done = icmp eq i32 %y.next, 16
  br i1 %done, label %exit, label %row

exit:
  ret i32 %sad.next
}

; Intrinsics used above.
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1 immarg)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)