From ea311465a1e818ed90c300b8bff2c7d51811b59c Mon Sep 17 00:00:00 2001
From: Yingchi Long <longyingchi24s@ict.ac.cn>
Date: Mon, 17 Mar 2025 20:57:27 +0800
Subject: [PATCH] init

---
 Makefile     | 25 +++++++++++++++++++++++++
 unittest.cpp | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 x264_sum.ll  | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 107 insertions(+)
 create mode 100644 Makefile
 create mode 100644 unittest.cpp
 create mode 100644 x264_sum.ll
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4fdc7c9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+# Builds one test driver against three codegen variants of x264_sum.ll and runs each.
+.PHONY: all
+all: unittest_scalar unittest_vector unittest_vector_unalign
+	./unittest_scalar
+	./unittest_vector
+	./unittest_vector_unalign
+
+# unittest_<variant> = unittest.o + x264_<variant>.o. Libraries are listed after
+# the objects because linkers resolve symbols from left to right.
+unittest_%: unittest.o x264_%.o
+	$(CXX) $(LDFLAGS) $^ -lstdc++ $(LDLIBS) -o $@
+
+unittest.o: unittest.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+# The three kernel objects differ only in the flags passed to the compiler.
+x264_scalar.o: x264_sum.ll
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+x264_vector.o: x264_sum.ll
+	$(CXX) -msimd -mcpu=sw8a $(CXXFLAGS) -c $< -o $@
+
+# Same as x264_vector.o, plus the backend option permitting misaligned memory accesses.
+x264_vector_unalign.o: x264_sum.ll
+	$(CXX) -msimd -mcpu=sw8a -mllvm -sw64-allows-misaligned-memory-accesses $(CXXFLAGS) -c $< -o $@
\ No newline at end of file
diff --git a/unittest.cpp b/unittest.cpp
new file mode 100644
index 0000000..1c6e211
--- /dev/null
+++ b/unittest.cpp
@@ -0,0 +1,50 @@
+#include <iostream>
+#include <cassert>
+#include <chrono>
+#include <cstdint> // use std::uint8_t instead of a user-defined uint8_t
+#include <limits>
+
+// SAD kernel defined outside this file (see x264_sum.ll); declared with C linkage, no body here.
+extern "C" int x264_pixel_sad_16x16(uint8_t *pix1,
+                                    int i_stride_pix1,
+                                    uint8_t *pix2,
+                                    int i_stride_pix2);
+
+// Unit test for the SAD kernel (marked as unchanged from an earlier revision).
+void test_sad() {
+    // NOTE(review): body elided in this patch ("same as before"); no checks execute here.
+}
+
+// Timing helper: calls the kernel `iterations` times, then prints total and average elapsed time.
+void measure_time(int iterations = 1000) {
+    // Test data: two all-zero 16x16 blocks, passed with a stride of 16 bytes.
+    uint8_t pix1[16 * 16] = {0};
+    uint8_t pix2[16 * 16] = {0};
+
+    // Every result is accumulated and printed so the calls stay observable; 64-bit avoids signed overflow.
+    long long result_sum = 0;
+    const auto start = std::chrono::steady_clock::now();  // monotonic clock, suited to interval timing
+    for (int i = 0; i < iterations; ++i) {
+        result_sum += x264_pixel_sad_16x16(pix1, 16, pix2, 16);
+    }
+    const auto end = std::chrono::steady_clock::now();
+
+    // Fractional microseconds (an integer count would truncate short runs); no division when nothing ran.
+    const double total_duration = std::chrono::duration<double, std::chrono::microseconds::period>(end - start).count();
+    const double avg_duration = iterations > 0 ? total_duration / iterations : 0.0;
+    std::cout << "Total Execution Time for " << iterations << " iterations: "
+              << total_duration << " μs" << std::endl;
+    std::cout << "Average Execution Time: " << avg_duration << " μs" << std::endl;
+    std::cout << "Total Result Sum: " << result_sum << std::endl; // printing the sum keeps the results in use
+}
+
+int main() {
+    // Run the unit test; the success message is printed once it returns.
+    test_sad();
+    std::cout << "All tests passed!" << std::endl;
+
+    // Time the kernel (default of 1000 iterations).
+    measure_time();
+
+    return 0;
+}
\ No newline at end of file
diff --git a/x264_sum.ll b/x264_sum.ll
new file mode 100644
index 0000000..7bf89ed
--- /dev/null
+++ b/x264_sum.ll
@@ -0,0 +1,32 @@
+define dso_local noundef i32 @x264_pixel_sad_16x16(ptr %0, i32 noundef %1, ptr %2, i32 noundef %3) { ; args: block A, its row stride, block B, its row stride
+  %5 = sext i32 %1 to i64 ; stride of %0, widened for pointer arithmetic
+  %6 = sext i32 %3 to i64 ; stride of %2, widened likewise
+  br label %7
+
+7: ; loop body, executed once per row (16 iterations)
+  %8 = phi i32 [ 0, %4 ], [ %23, %7 ] ; row counter
+  %9 = phi i32 [ 0, %4 ], [ %20, %7 ] ; running sum
+  %10 = phi ptr [ %0, %4 ], [ %21, %7 ] ; current row of the first block
+  %11 = phi ptr [ %2, %4 ], [ %22, %7 ] ; current row of the second block
+  %12 = load <16 x i8>, ptr %10, align 1 ; 16 bytes, no alignment assumed
+  %13 = zext <16 x i8> %12 to <16 x i16> ; widen so the subtraction cannot wrap
+  %14 = load <16 x i8>, ptr %11, align 1 ; 16 bytes, no alignment assumed
+  %15 = zext <16 x i8> %14 to <16 x i16> ; widen so the subtraction cannot wrap
+  %16 = sub nsw <16 x i16> %13, %15 ; per-lane difference, within [-255, 255]
+  %17 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %16, i1 false) ; per-lane absolute value
+  %18 = zext <16 x i16> %17 to <16 x i32> ; widen to the accumulator's type
+  %19 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %18) ; sum of the 16 lanes
+  %20 = add i32 %19, %9 ; fold this row into the running sum
+  %21 = getelementptr inbounds i8, ptr %10, i64 %5 ; advance the first pointer by its stride
+  %22 = getelementptr inbounds i8, ptr %11, i64 %6 ; advance the second pointer by its stride
+  %23 = add nuw nsw i32 %8, 1 ; next row index
+  %24 = icmp eq i32 %23, 16 ; done after 16 rows
+  br i1 %24, label %25, label %7
+
+25: ; loop exit
+  ret i32 %20 ; total over all 16 rows
+}
+
+declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1 immarg)
+
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
\ No newline at end of file