设计行情 SDK 时,针对不同的回调函数实现方式,做了一次耗时对比测试。近期在看 C++ 函数式编程:当函数变成了一等公民、在程序内部流转时,不同的传递方式在耗时上有什么不同?
前文链接:编译器、回调函数、性能测试
leimao 大佬刚好也做了类似的测试,借他的代码一用。
正文
执行平台依旧是我们的老朋友,https://wandbox.org/
#include <cassert>
#include <chrono>
#include <functional>
#include <iostream>
#include <vector>
// Returns `input` incremented by one. This is the plain free function that
// the benchmark calls directly and also passes around as a function pointer.
int add_one(int input)
{
    return input + 1;
}
// Checks that `output_vector` holds `input_vector` with one added to every
// element.
//
// input_vector:  reference values.
// output_vector: values to verify; only the first input_vector.size()
//                elements are inspected.
//
// Returns true when output_vector[i] == input_vector[i] + 1 for every index
// i of input_vector. Returns false on the first mismatch, or when
// output_vector has fewer elements than input_vector.
bool validate_vector_add_one(std::vector<int> const& input_vector,
                             std::vector<int> const& output_vector)
{
    // A shorter output can never match element for element. Report it as
    // invalid instead of letting a bounds-checked access throw
    // std::out_of_range out of a bool-returning validator.
    if (output_vector.size() < input_vector.size())
    {
        return false;
    }
    for (size_t i{0}; i < input_vector.size(); ++i)
    {
        // Indices are in range for both vectors thanks to the size check.
        if (output_vector[i] != input_vector[i] + 1)
        {
            return false;
        }
    }
    return true;
}
// Overwrites every element of `input_vector` with 0. The vector's size is
// left unchanged; an empty vector is a no-op.
void reset_vector(std::vector<int>& input_vector)
{
    for (int& element : input_vector)
    {
        element = 0;
    }
}
// Applies `func` to `input` and stores the result in `output`.
//
// The callable's concrete type is the template parameter `Func`, so each
// distinct callable type (for example each lambda's closure type) gets its
// own instantiation. The callable itself is taken by value.
//
// output: receives func(input).
// input:  argument forwarded to the callable.
// func:   any callable invocable with a `T const&` whose result is
//         assignable to `T`.
template <typename T, typename Func>
void unitary_function_pass_by_lambda_function(
    T& output,
    T const& input,
    Func const func)
{
    output = func(input);
}
// Applies `func` to `input` and stores the result in `output`.
//
// The std::function is deliberately taken BY VALUE: when the caller passes
// an existing std::function object, every call copy-constructs it into the
// parameter. That copy is exactly the cost this benchmark variant exists to
// expose, so do not "optimize" the signature.
//
// output: receives func(input).
// input:  argument forwarded to the wrapped callable.
// func:   type-erased callable with signature T(T).
template <typename T>
void unitary_function_pass_by_std_function_value(
    T& output,
    T const& input,
    std::function<T(T)> const func)
{
    output = func(input);
}
// Applies `func` to `input` and stores the result in `output`.
//
// Same as the by-value variant except that the std::function is bound by
// const reference, so passing an existing std::function object does not
// copy it. The call still goes through std::function's type-erased
// operator().
//
// output: receives func(input).
// input:  argument forwarded to the wrapped callable.
// func:   type-erased callable with signature T(T).
template <typename T>
void unitary_function_pass_by_std_function_reference(
    T& output,
    T const& input,
    std::function<T(T)> const& func)
{
    output = func(input);
}
// Applies the function pointed to by `func` to `input` and stores the result
// in `output`.
//
// output: receives func(input).
// input:  argument passed (by value) to the pointed-to function.
// func:   pointer to a function with signature T(T); it is dereferenced
//         unconditionally, so it must not be null.
template <typename T>
void unitary_function_pass_by_function_pointer(
    T& output,
    T const& input,
    T (*func)(T))
{
    output = func(input);
}
int main()
{
// Set floating point format std::cout with 3 decimal places.
std::cout.precision(3);
size_t const num_elements{10000000};
std::vector<int> input_vector(num_elements, 0);
std::vector<int> output_vector(num_elements, 0);
auto const lambda_function_add_one{[](int const& input) -> int
{ return input + 1; }};
std::function<int(int)> const std_function_add_one{lambda_function_add_one};
std::cout << "The size of a function pointer: " << sizeof(&add_one)
<< std::endl;
std::cout << "The size of a std::function pointer: "
<< sizeof(&std_function_add_one) << std::endl;
std::cout << "The size of a std::function: " << sizeof(std_function_add_one)
<< std::endl;
// Call function frequently in a vanilla way.
// The compiler knows what function to call at compile time and can optimize
// the code.
// This is the best performance we could get.
std::chrono::steady_clock::time_point const time_start_vanilla{
std::chrono::steady_clock::now()};
for (size_t i{0}; i < num_elements; ++i)
{
output_vector.at(i) = add_one(input_vector.at(i));
}
std::chrono::steady_clock::time_point const time_end_vanilla{
std::chrono::steady_clock::now()};
auto const time_elapsed_vanilla{
std::chrono::duration_cast<std::chrono::nanoseconds>(time_end_vanilla -
time_start_vanilla)
.count()};
float const latency_vanilla{time_elapsed_vanilla /
static_cast<float>(num_elements)};
std::cout << "Latency Pass Vanilla: " << latency_vanilla << " ns"
<< std::endl;
assert(validate_vector_add_one(input_vector, output_vector));
reset_vector(output_vector);
// Sometimes, we don't know what function to call at compile time.
// We can use std::function to pass a function as an argument.
// In this case, we pass the std::function by value.
// Because the size of a std::function is 32 bytes, passing by value
// results in a lot of copying and bad performance.
std::chrono::steady_clock::time_point const
time_start_pass_by_std_function_value{std::chrono::steady_clock::now()};
for (size_t i{0}; i < num_elements; ++i)
{
unitary_function_pass_by_std_function_value(
output_vector.at(i), input_vector.at(i), std_function_add_one);
}
std::chrono::steady_clock::time_point const
time_end_pass_by_std_function_value{std::chrono::steady_clock::now()};
auto const time_elapsed_pass_by_std_function_value{
std::chrono::duration_cast<std::chrono::nanoseconds>(
time_end_pass_by_std_function_value -
time_start_pass_by_std_function_value)
.count()};
float const latency_pass_by_std_function_value{
time_elapsed_pass_by_std_function_value /
static_cast<float>(num_elements)};
std::cout << "Latency Pass By Std Function Value: "
<< latency_pass_by_std_function_value << " ns" << std::endl;
assert(validate_vector_add_one(input_vector, output_vector));
reset_vector(output_vector);
// Instead of passing the std::function by value, we can pass it by
// reference (pointer). In this case, object copying is eliminated. The
// performance is better than passing the std::function by value. However,
// the performance is still not as good as the vanilla way.
std::chrono::steady_clock::time_point const
time_start_pass_by_std_function_reference{
std::chrono::steady_clock::now()};
for (size_t i{0}; i < num_elements; ++i)
{
unitary_function_pass_by_std_function_reference(
output_vector.at(i), input_vector.at(i), std_function_add_one);
}
std::chrono::steady_clock::time_point const
time_end_pass_by_std_function_reference{
std::chrono::steady_clock::now()};
auto const time_elapsed_pass_by_std_function_reference{
std::chrono::duration_cast<std::chrono::nanoseconds>(
time_end_pass_by_std_function_reference -
time_start_pass_by_std_function_reference)
.count()};
float const latency_pass_by_std_function_reference{
time_elapsed_pass_by_std_function_reference /
static_cast<float>(num_elements)};
std::cout << "Latency Pass By Std Function Reference: "
<< latency_pass_by_std_function_reference << " ns" << std::endl;
assert(validate_vector_add_one(input_vector, output_vector));
reset_vector(output_vector);
// std::function is a general purpose wrapper for function pointers,
// callable objects, and lambda functions. Because it's general purpose,
// it's not as efficient as a function pointer. In this case, we pass a
// function pointer to a function. The performance is better than passing
// the std::function by reference.
std::chrono::steady_clock::time_point const
time_start_pass_by_function_pointer{std::chrono::steady_clock::now()};
for (size_t i{0}; i < num_elements; ++i)
{
unitary_function_pass_by_function_pointer(output_vector.at(i),
input_vector.at(i), &add_one);
}
std::chrono::steady_clock::time_point const
time_end_pass_by_function_pointer{std::chrono::steady_clock::now()};
auto const time_elapsed_pass_by_function_pointer{
std::chrono::duration_cast<std::chrono::nanoseconds>(
time_end_pass_by_function_pointer -
time_start_pass_by_function_pointer)
.count()};
float const latency_pass_by_function_pointer{
time_elapsed_pass_by_function_pointer /
static_cast<float>(num_elements)};
std::cout << "Latency Pass By Function Pointer: "
<< latency_pass_by_function_pointer << " ns" << std::endl;
assert(validate_vector_add_one(input_vector, output_vector));
reset_vector(output_vector);
// We can also pass a lambda function to a function.
// The compiler knows what function to call at compile time and can optimize
// the code. The performance is also better than passing the std::function
// by reference.
std::chrono::steady_clock::time_point const
time_start_pass_by_lambda_function{std::chrono::steady_clock::now()};
for (size_t i{0}; i < num_elements; ++i)
{
unitary_function_pass_by_lambda_function(
output_vector.at(i), input_vector.at(i), lambda_function_add_one);
}
std::chrono::steady_clock::time_point const
time_end_pass_by_lambda_function{std::chrono::steady_clock::now()};
auto const time_elapsed_pass_by_lambda_function{
std::chrono::duration_cast<std::chrono::nanoseconds>(
time_end_pass_by_lambda_function -
time_start_pass_by_lambda_function)
.count()};
float const latency_pass_by_lambda_function{
time_elapsed_pass_by_lambda_function /
static_cast<float>(num_elements)};
std::cout << "Latency Pass By Lambda Function: "
<< latency_pass_by_lambda_function << " ns" << std::endl;
assert(validate_vector_add_one(input_vector, output_vector));
reset_vector(output_vector);
}
# 组里日常只开启 -O2 优化,本次编译器选用 gcc 13;不同版本的 gcc 测得的耗时略有不同,版本越高,lambda 的表现越好
The size of a function pointer: 8
The size of a std::function pointer: 8
The size of a std::function: 32
Latency Pass Vanilla: 0.418 ns
Latency Pass By Std Function Value: 3.47 ns
Latency Pass By Std Function Reference: 1.36 ns
Latency Pass By Function Pointer: 0.396 ns
Latency Pass By Lambda Function: 0.44 ns
参考资料
https://leimao.github.io/blog/CPP-Function-Call-Performance/