在C中调用函数时略有性能下降
我正在优化我为博士课题编写的一些模拟代码,目标如下:
- 使其模块化(创建libs并将代码分为小块)
- 使其运行性能至少与我现有的代码相同。
- 并行。
目前,我专注于第1步和第2步。我正在调试一段代码,它使用四阶Runge-Kutta方法对非线性方程组进行积分,并将结果打印到输出文件中。此方法是我正在优化的所有其他分析和方法的核心。因此,每毫秒的运行时间都很重要,因为在某些情况下,我必须调用这种方法数百万次。
我有两个我运行10次的版本:
- 在第一个版本中,我定义函数
duffing
来处理非线性方程组,并定义函数 rk4
来处理 Runge-Kutta 积分器的单个步骤:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
void duffing(int dim, double *x, double t, double *par, double *f);
/*
 * Integrates the system defined by duffing() with repeated rk4() steps,
 * writes the time and the two state values to "output.txt" after every step,
 * and prints the CPU time consumed by the integration loop.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if memory cannot be allocated or the
 * output file cannot be opened or closed cleanly.
 */
int main(void) {
    // Program Parameters
    const double pi = 4 * atan(1);
    int DIM = 2;      // number of state variables handed to rk4()/duffing()
    int nP = 1000;    // outer loop count
    int nDiv = 1000;  // steps per outer loop pass
    int nPar = 5;     // number of entries in par

    // Parameters
    double *par = malloc(nPar * sizeof *par);
    double *f = malloc(DIM * sizeof *f);
    double *x = malloc(DIM * sizeof *x);
    if (par == NULL || f == NULL || x == NULL) {
        fprintf(stderr, "Failed to allocate memory for the simulation\n");
        free(par); free(f); free(x);  // free(NULL) is a no-op
        return EXIT_FAILURE;
    }
    par[0] = 1.0;
    par[1] = 0.15;
    par[2] = 0.01;
    par[3] = -0.5;
    par[4] = 0.5;

    // Initial Conditions
    double t = 0.0;
    x[0] = 1.0;
    x[1] = 0.0;

    // Numerical Parameters: 2*pi/par[0] split into nDiv equal steps
    double h = (2 * pi) / (nDiv * par[0]);

    // Create Output File
    FILE *output = fopen("output.txt", "w");
    if (output == NULL) {
        perror("output.txt");
        free(par); free(f); free(x);
        return EXIT_FAILURE;
    }

    // Time variables
    double time_spent = 0.0;
    clock_t time_i = clock();

    // Solution
    fprintf(output, "Time x[0] x[1]\n");
    fprintf(output, "%.10lf %.10lf %.10lf\n", t, x[0], x[1]);
    for (int i = 0; i < nP; i++) {
        for (int j = 0; j < nDiv; j++) {
            rk4(DIM, x, t, h, par, f, duffing);
            t = t + h;
            fprintf(output, "%.10lf %.10lf %.10lf\n", t, x[0], x[1]);
        }
    }

    // Time Spent
    clock_t time_f = clock();
    time_spent += (double)(time_f - time_i) / CLOCKS_PER_SEC;
    printf("The elapsed time is %f seconds\n", time_spent);

    // Close the file outside the timed region; fclose() reports deferred
    // write errors that the buffered fprintf() calls may have hidden.
    int status = EXIT_SUCCESS;
    if (fclose(output) != 0) {
        perror("output.txt");
        status = EXIT_FAILURE;
    }

    // Free Memory
    free(par); free(f); free(x);
    return status;
}
void duffing(int dim, double *x, double t, double *par, double *f) {
if (dim == 2) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
}
else if (dim == 6) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
for (int i = 0; i < 2; i++) {
f[2 + i] = x[4 + i];
f[4 + i] = -par[3]*x[2 + i] - 3*par[4]*x[0]*x[0]*x[2 + i] - 2*par[2]*x[4 + i];
}
} else {
printf("Wrong dimension (dim) or (ndim) allocated for system of equations\n");
exit(1);
}
}
/*
 * Advances x by one classical fourth-order Runge-Kutta step of size h.
 *
 * dim    - number of equations; x must hold dim values
 * x      - state at time t on entry, overwritten with the state at t + h
 * t      - current time
 * h      - step size
 * par    - parameter array, passed through to edosys unchanged
 * f      - not used by this routine; kept so the existing prototype and
 *          callers remain valid
 * edosys - callback that evaluates the derivatives:
 *          edosys(dim, state, time, par, out)
 *
 * Always returns NULL: the declared void * result carries no information and
 * is retained only for compatibility with the existing prototype. If dim is
 * not positive or edosys is NULL, the call leaves x untouched.
 */
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
          void (*edosys)(int, double *, double, double *, double *)) {
    // A variable-length array must have a positive size, and a null callback
    // cannot be invoked, so bail out before either can happen.
    if (dim <= 0 || edosys == NULL) {
        return NULL;
    }
    (void)f;  // intentionally unused

    double tmp[dim], k1[dim], k2[dim], k3[dim], k4[dim];

    // First slope, evaluated at the start of the interval
    edosys(dim, x, t, par, k1);

    // Second slope, evaluated at the midpoint reached by following k1
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + 0.5 * (h * k1[i]);
    }
    edosys(dim, tmp, t + 0.5 * h, par, k2);

    // Third slope, evaluated at the midpoint reached by following k2
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + 0.5 * (h * k2[i]);
    }
    edosys(dim, tmp, t + 0.5 * h, par, k3);

    // Fourth slope, evaluated at the end of the interval reached by following k3
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + h * k3[i];
    }
    edosys(dim, tmp, t + h, par, k4);

    // Weighted average of the four slopes gives the next value of x
    for (int i = 0; i < dim; i++) {
        x[i] = x[i] + (h/6.0) * (k1[i] + 2 * k2[i] + 2 * k3[i] + k4[i]);
    }

    // The function is declared to return void *, so return a defined value.
    return NULL;
}
每次运行的经过的时间都可以在下面看到:
The elapsed time is 0.718215 seconds
The elapsed time is 0.713928 seconds
The elapsed time is 0.705679 seconds
The elapsed time is 0.713959 seconds
The elapsed time is 0.707523 seconds
The elapsed time is 0.710903 seconds
The elapsed time is 0.708110 seconds
The elapsed time is 0.718513 seconds
The elapsed time is 0.706710 seconds
The elapsed time is 0.710024 seconds
- 在第二个版本中,我定义了与之前相同的
duffing
和 rk4
函数,并添加了第三个函数 rk4_solution,
用来处理该方法的所有步骤:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
void duffing(int dim, double *x, double t, double *par, double *f);
void rk4_solution(FILE *output, double *x, double t, int dim, int np, int ndiv,
double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
/*
 * Sets up the parameters and initial state, hands the whole integration to
 * rk4_solution() (which writes its results to "output.txt"), and prints the
 * CPU time consumed by that call.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if memory cannot be allocated or the
 * output file cannot be opened or closed cleanly.
 */
int main(void) {
    // Program Parameters
    int DIM = 2;      // number of state variables
    int nP = 1000;    // passed to rk4_solution() as np
    int nDiv = 1000;  // passed to rk4_solution() as ndiv
    int nPar = 5;     // number of entries in par

    // Parameters
    double *par = malloc(nPar * sizeof *par);
    double *f = malloc(DIM * sizeof *f);
    double *x = malloc(DIM * sizeof *x);
    if (par == NULL || f == NULL || x == NULL) {
        fprintf(stderr, "Failed to allocate memory for the simulation\n");
        free(par); free(f); free(x);  // free(NULL) is a no-op
        return EXIT_FAILURE;
    }
    par[0] = 1.0;
    par[1] = 0.15;
    par[2] = 0.01;
    par[3] = -0.5;
    par[4] = 0.5;

    // Initial Conditions
    double t = 0.0;
    x[0] = 1.0;
    x[1] = 0.0;

    // Create Output File
    FILE *output = fopen("output.txt", "w");
    if (output == NULL) {
        perror("output.txt");
        free(par); free(f); free(x);
        return EXIT_FAILURE;
    }

    // Time variables
    double time_spent = 0.0;
    clock_t time_i = clock();

    // Solution
    rk4_solution(output, x, t, DIM, nP, nDiv, par, f, duffing);

    // Time Spent
    clock_t time_f = clock();
    time_spent += (double)(time_f - time_i) / CLOCKS_PER_SEC;
    printf("The elapsed time is %f seconds\n", time_spent);

    // Close the file outside the timed region; fclose() reports deferred
    // write errors that the buffered fprintf() calls may have hidden.
    int status = EXIT_SUCCESS;
    if (fclose(output) != 0) {
        perror("output.txt");
        status = EXIT_FAILURE;
    }

    // Free Memory
    free(par); free(f); free(x);
    return status;
}
void duffing(int dim, double *x, double t, double *par, double *f) {
if (dim == 2) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
}
else if (dim == 6) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
for (int i = 0; i < 2; i++) {
f[2 + i] = x[4 + i];
f[4 + i] = -par[3]*x[2 + i] - 3*par[4]*x[0]*x[0]*x[2 + i] - 2*par[2]*x[4 + i];
}
} else {
printf("Wrong dimension (dim) or (ndim) allocated for system of equations\n");
exit(1);
}
}
/*
 * Advances x by one classical fourth-order Runge-Kutta step of size h.
 *
 * dim    - number of equations; x must hold dim values
 * x      - state at time t on entry, overwritten with the state at t + h
 * t      - current time
 * h      - step size
 * par    - parameter array, passed through to edosys unchanged
 * f      - not used by this routine; kept so the existing prototype and
 *          callers remain valid
 * edosys - callback that evaluates the derivatives:
 *          edosys(dim, state, time, par, out)
 *
 * Always returns NULL: the declared void * result carries no information and
 * is retained only for compatibility with the existing prototype. If dim is
 * not positive or edosys is NULL, the call leaves x untouched.
 */
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
          void (*edosys)(int, double *, double, double *, double *)) {
    // A variable-length array must have a positive size, and a null callback
    // cannot be invoked, so bail out before either can happen.
    if (dim <= 0 || edosys == NULL) {
        return NULL;
    }
    (void)f;  // intentionally unused

    double tmp[dim], k1[dim], k2[dim], k3[dim], k4[dim];

    // First slope, evaluated at the start of the interval
    edosys(dim, x, t, par, k1);

    // Second slope, evaluated at the midpoint reached by following k1
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + 0.5 * (h * k1[i]);
    }
    edosys(dim, tmp, t + 0.5 * h, par, k2);

    // Third slope, evaluated at the midpoint reached by following k2
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + 0.5 * (h * k2[i]);
    }
    edosys(dim, tmp, t + 0.5 * h, par, k3);

    // Fourth slope, evaluated at the end of the interval reached by following k3
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + h * k3[i];
    }
    edosys(dim, tmp, t + h, par, k4);

    // Weighted average of the four slopes gives the next value of x
    for (int i = 0; i < dim; i++) {
        x[i] = x[i] + (h/6.0) * (k1[i] + 2 * k2[i] + 2 * k3[i] + k4[i]);
    }

    // The function is declared to return void *, so return a defined value.
    return NULL;
}
/*
 * Runs the complete integration and logs it.
 *
 * Writes a header line and the initial state to output, then performs np
 * outer passes of ndiv rk4() steps each, appending one line with the time
 * and the first two state values after every step.
 *
 * output - open, writable stream receiving the results
 * x      - state vector, updated in place by rk4()
 * t      - start time (local copy; the caller's value is not changed)
 * dim    - number of equations, forwarded to rk4()
 * np     - number of outer passes
 * ndiv   - steps per pass; also sets the step size 2*pi / (ndiv * par[0])
 * par    - parameter array, forwarded to rk4()
 * f      - work array, forwarded to rk4()
 * edosys - derivative callback, forwarded to rk4()
 */
void rk4_solution(FILE *output, double *x, double t, int dim, int np, int ndiv,
                  double *par, double *f,
                  void (*edosys)(int, double *, double, double *, double *)) {
    const double pi = 4 * atan(1);

    // Step size, fixed for the whole run
    const double h = (2 * pi) / (ndiv * par[0]);

    // Header followed by the initial condition
    fprintf(output, "Time x[0] x[1]\n");
    fprintf(output, "%.10lf %.10lf %.10lf\n", t, x[0], x[1]);

    // Count down instead of up; the number of iterations is the same.
    for (int passes_left = np; passes_left > 0; passes_left--) {
        for (int steps_left = ndiv; steps_left > 0; steps_left--) {
            rk4(dim, x, t, h, par, f, edosys);
            t = t + h;
            fprintf(output, "%.10lf %.10lf %.10lf\n", t, x[0], x[1]);
        }
    }
}
通过添加第三个函数,运行时间稍微更改为:
The elapsed time is 0.753674 seconds
The elapsed time is 0.748255 seconds
The elapsed time is 0.738883 seconds
The elapsed time is 0.738666 seconds
The elapsed time is 0.736813 seconds
The elapsed time is 0.740047 seconds
The elapsed time is 0.736575 seconds
The elapsed time is 0.739985 seconds
The elapsed time is 0.737410 seconds
The elapsed time is 0.738836 seconds
有人可以帮助我理解为什么会发生这种情况以及如何避免吗?毕竟我在代码的两个版本中做的基本上是相同的操作。
这个结果让我非常担心,因此我在稍微复杂一些的代码中测试了同样的事情,结果平均相差20秒。最终,它可能会让我最复杂的分析多花几周,甚至几个月的时间。
提前致谢!
I'm in the process of optimizing some simulation codes I made for my PhD, focusing on the following objectives:
- Make it modular (creating libs and dividing the code into small pieces)
- Make it run at least with the same performance as the codes that I already have.
- Parallelize.
Currently, I'm focused on the steps 1 and 2. I'm playing with a code that integrates a nonlinear system of equations by means of the Runge-Kutta 4th order method and prints the results into an output file. This method is the core of all my other analyses and methods that I'm optimizing. So, every millisecond of running time is important, as in some cases I have to call the method millions of times.
I have two versions that I ran 10 times:
- In the first version I define the function
duffing
to handle the nonlinear system of equations and the function rk4
to handle 1 step of the Runge-Kutta integrator:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
void duffing(int dim, double *x, double t, double *par, double *f);
/*
 * Integrates the system defined by duffing() with repeated rk4() steps,
 * writes the time and the two state values to "output.txt" after every step,
 * and prints the CPU time consumed by the integration loop.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if memory cannot be allocated or the
 * output file cannot be opened or closed cleanly.
 */
int main(void) {
    // Program Parameters
    const double pi = 4 * atan(1);
    int DIM = 2;      // number of state variables handed to rk4()/duffing()
    int nP = 1000;    // outer loop count
    int nDiv = 1000;  // steps per outer loop pass
    int nPar = 5;     // number of entries in par

    // Parameters
    double *par = malloc(nPar * sizeof *par);
    double *f = malloc(DIM * sizeof *f);
    double *x = malloc(DIM * sizeof *x);
    if (par == NULL || f == NULL || x == NULL) {
        fprintf(stderr, "Failed to allocate memory for the simulation\n");
        free(par); free(f); free(x);  // free(NULL) is a no-op
        return EXIT_FAILURE;
    }
    par[0] = 1.0;
    par[1] = 0.15;
    par[2] = 0.01;
    par[3] = -0.5;
    par[4] = 0.5;

    // Initial Conditions
    double t = 0.0;
    x[0] = 1.0;
    x[1] = 0.0;

    // Numerical Parameters: 2*pi/par[0] split into nDiv equal steps
    double h = (2 * pi) / (nDiv * par[0]);

    // Create Output File
    FILE *output = fopen("output.txt", "w");
    if (output == NULL) {
        perror("output.txt");
        free(par); free(f); free(x);
        return EXIT_FAILURE;
    }

    // Time variables
    double time_spent = 0.0;
    clock_t time_i = clock();

    // Solution
    fprintf(output, "Time x[0] x[1]\n");
    fprintf(output, "%.10lf %.10lf %.10lf\n", t, x[0], x[1]);
    for (int i = 0; i < nP; i++) {
        for (int j = 0; j < nDiv; j++) {
            rk4(DIM, x, t, h, par, f, duffing);
            t = t + h;
            fprintf(output, "%.10lf %.10lf %.10lf\n", t, x[0], x[1]);
        }
    }

    // Time Spent
    clock_t time_f = clock();
    time_spent += (double)(time_f - time_i) / CLOCKS_PER_SEC;
    printf("The elapsed time is %f seconds\n", time_spent);

    // Close the file outside the timed region; fclose() reports deferred
    // write errors that the buffered fprintf() calls may have hidden.
    int status = EXIT_SUCCESS;
    if (fclose(output) != 0) {
        perror("output.txt");
        status = EXIT_FAILURE;
    }

    // Free Memory
    free(par); free(f); free(x);
    return status;
}
void duffing(int dim, double *x, double t, double *par, double *f) {
if (dim == 2) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
}
else if (dim == 6) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
for (int i = 0; i < 2; i++) {
f[2 + i] = x[4 + i];
f[4 + i] = -par[3]*x[2 + i] - 3*par[4]*x[0]*x[0]*x[2 + i] - 2*par[2]*x[4 + i];
}
} else {
printf("Wrong dimension (dim) or (ndim) allocated for system of equations\n");
exit(1);
}
}
/*
 * Advances x by one classical fourth-order Runge-Kutta step of size h.
 *
 * dim    - number of equations; x must hold dim values
 * x      - state at time t on entry, overwritten with the state at t + h
 * t      - current time
 * h      - step size
 * par    - parameter array, passed through to edosys unchanged
 * f      - not used by this routine; kept so the existing prototype and
 *          callers remain valid
 * edosys - callback that evaluates the derivatives:
 *          edosys(dim, state, time, par, out)
 *
 * Always returns NULL: the declared void * result carries no information and
 * is retained only for compatibility with the existing prototype. If dim is
 * not positive or edosys is NULL, the call leaves x untouched.
 */
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
          void (*edosys)(int, double *, double, double *, double *)) {
    // A variable-length array must have a positive size, and a null callback
    // cannot be invoked, so bail out before either can happen.
    if (dim <= 0 || edosys == NULL) {
        return NULL;
    }
    (void)f;  // intentionally unused

    double tmp[dim], k1[dim], k2[dim], k3[dim], k4[dim];

    // First slope, evaluated at the start of the interval
    edosys(dim, x, t, par, k1);

    // Second slope, evaluated at the midpoint reached by following k1
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + 0.5 * (h * k1[i]);
    }
    edosys(dim, tmp, t + 0.5 * h, par, k2);

    // Third slope, evaluated at the midpoint reached by following k2
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + 0.5 * (h * k2[i]);
    }
    edosys(dim, tmp, t + 0.5 * h, par, k3);

    // Fourth slope, evaluated at the end of the interval reached by following k3
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + h * k3[i];
    }
    edosys(dim, tmp, t + h, par, k4);

    // Weighted average of the four slopes gives the next value of x
    for (int i = 0; i < dim; i++) {
        x[i] = x[i] + (h/6.0) * (k1[i] + 2 * k2[i] + 2 * k3[i] + k4[i]);
    }

    // The function is declared to return void *, so return a defined value.
    return NULL;
}
The elapsed time of each run can be seen below:
The elapsed time is 0.718215 seconds
The elapsed time is 0.713928 seconds
The elapsed time is 0.705679 seconds
The elapsed time is 0.713959 seconds
The elapsed time is 0.707523 seconds
The elapsed time is 0.710903 seconds
The elapsed time is 0.708110 seconds
The elapsed time is 0.718513 seconds
The elapsed time is 0.706710 seconds
The elapsed time is 0.710024 seconds
- In the second version I defined the same
duffing
and rk4
functions as before, and added a third function called rk4_solution
that handles all the steps of the method:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
void duffing(int dim, double *x, double t, double *par, double *f);
void rk4_solution(FILE *output, double *x, double t, int dim, int np, int ndiv,
double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
/*
 * Sets up the parameters and initial state, hands the whole integration to
 * rk4_solution() (which writes its results to "output.txt"), and prints the
 * CPU time consumed by that call.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if memory cannot be allocated or the
 * output file cannot be opened or closed cleanly.
 */
int main(void) {
    // Program Parameters
    int DIM = 2;      // number of state variables
    int nP = 1000;    // passed to rk4_solution() as np
    int nDiv = 1000;  // passed to rk4_solution() as ndiv
    int nPar = 5;     // number of entries in par

    // Parameters
    double *par = malloc(nPar * sizeof *par);
    double *f = malloc(DIM * sizeof *f);
    double *x = malloc(DIM * sizeof *x);
    if (par == NULL || f == NULL || x == NULL) {
        fprintf(stderr, "Failed to allocate memory for the simulation\n");
        free(par); free(f); free(x);  // free(NULL) is a no-op
        return EXIT_FAILURE;
    }
    par[0] = 1.0;
    par[1] = 0.15;
    par[2] = 0.01;
    par[3] = -0.5;
    par[4] = 0.5;

    // Initial Conditions
    double t = 0.0;
    x[0] = 1.0;
    x[1] = 0.0;

    // Create Output File
    FILE *output = fopen("output.txt", "w");
    if (output == NULL) {
        perror("output.txt");
        free(par); free(f); free(x);
        return EXIT_FAILURE;
    }

    // Time variables
    double time_spent = 0.0;
    clock_t time_i = clock();

    // Solution
    rk4_solution(output, x, t, DIM, nP, nDiv, par, f, duffing);

    // Time Spent
    clock_t time_f = clock();
    time_spent += (double)(time_f - time_i) / CLOCKS_PER_SEC;
    printf("The elapsed time is %f seconds\n", time_spent);

    // Close the file outside the timed region; fclose() reports deferred
    // write errors that the buffered fprintf() calls may have hidden.
    int status = EXIT_SUCCESS;
    if (fclose(output) != 0) {
        perror("output.txt");
        status = EXIT_FAILURE;
    }

    // Free Memory
    free(par); free(f); free(x);
    return status;
}
void duffing(int dim, double *x, double t, double *par, double *f) {
if (dim == 2) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
}
else if (dim == 6) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
for (int i = 0; i < 2; i++) {
f[2 + i] = x[4 + i];
f[4 + i] = -par[3]*x[2 + i] - 3*par[4]*x[0]*x[0]*x[2 + i] - 2*par[2]*x[4 + i];
}
} else {
printf("Wrong dimension (dim) or (ndim) allocated for system of equations\n");
exit(1);
}
}
/*
 * Advances x by one classical fourth-order Runge-Kutta step of size h.
 *
 * dim    - number of equations; x must hold dim values
 * x      - state at time t on entry, overwritten with the state at t + h
 * t      - current time
 * h      - step size
 * par    - parameter array, passed through to edosys unchanged
 * f      - not used by this routine; kept so the existing prototype and
 *          callers remain valid
 * edosys - callback that evaluates the derivatives:
 *          edosys(dim, state, time, par, out)
 *
 * Always returns NULL: the declared void * result carries no information and
 * is retained only for compatibility with the existing prototype. If dim is
 * not positive or edosys is NULL, the call leaves x untouched.
 */
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
          void (*edosys)(int, double *, double, double *, double *)) {
    // A variable-length array must have a positive size, and a null callback
    // cannot be invoked, so bail out before either can happen.
    if (dim <= 0 || edosys == NULL) {
        return NULL;
    }
    (void)f;  // intentionally unused

    double tmp[dim], k1[dim], k2[dim], k3[dim], k4[dim];

    // First slope, evaluated at the start of the interval
    edosys(dim, x, t, par, k1);

    // Second slope, evaluated at the midpoint reached by following k1
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + 0.5 * (h * k1[i]);
    }
    edosys(dim, tmp, t + 0.5 * h, par, k2);

    // Third slope, evaluated at the midpoint reached by following k2
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + 0.5 * (h * k2[i]);
    }
    edosys(dim, tmp, t + 0.5 * h, par, k3);

    // Fourth slope, evaluated at the end of the interval reached by following k3
    for (int i = 0; i < dim; i++) {
        tmp[i] = x[i] + h * k3[i];
    }
    edosys(dim, tmp, t + h, par, k4);

    // Weighted average of the four slopes gives the next value of x
    for (int i = 0; i < dim; i++) {
        x[i] = x[i] + (h/6.0) * (k1[i] + 2 * k2[i] + 2 * k3[i] + k4[i]);
    }

    // The function is declared to return void *, so return a defined value.
    return NULL;
}
/*
 * Runs the complete integration and logs it.
 *
 * Writes a header line and the initial state to output, then performs np
 * outer passes of ndiv rk4() steps each, appending one line with the time
 * and the first two state values after every step.
 *
 * output - open, writable stream receiving the results
 * x      - state vector, updated in place by rk4()
 * t      - start time (local copy; the caller's value is not changed)
 * dim    - number of equations, forwarded to rk4()
 * np     - number of outer passes
 * ndiv   - steps per pass; also sets the step size 2*pi / (ndiv * par[0])
 * par    - parameter array, forwarded to rk4()
 * f      - work array, forwarded to rk4()
 * edosys - derivative callback, forwarded to rk4()
 */
void rk4_solution(FILE *output, double *x, double t, int dim, int np, int ndiv,
                  double *par, double *f,
                  void (*edosys)(int, double *, double, double *, double *)) {
    const double pi = 4 * atan(1);

    // Step size, fixed for the whole run
    const double h = (2 * pi) / (ndiv * par[0]);

    // Header followed by the initial condition
    fprintf(output, "Time x[0] x[1]\n");
    fprintf(output, "%.10lf %.10lf %.10lf\n", t, x[0], x[1]);

    // Count down instead of up; the number of iterations is the same.
    for (int passes_left = np; passes_left > 0; passes_left--) {
        for (int steps_left = ndiv; steps_left > 0; steps_left--) {
            rk4(dim, x, t, h, par, f, edosys);
            t = t + h;
            fprintf(output, "%.10lf %.10lf %.10lf\n", t, x[0], x[1]);
        }
    }
}
By adding the third function, the running times slightly changed to:
The elapsed time is 0.753674 seconds
The elapsed time is 0.748255 seconds
The elapsed time is 0.738883 seconds
The elapsed time is 0.738666 seconds
The elapsed time is 0.736813 seconds
The elapsed time is 0.740047 seconds
The elapsed time is 0.736575 seconds
The elapsed time is 0.739985 seconds
The elapsed time is 0.737410 seconds
The elapsed time is 0.738836 seconds
Could someone help me understand why this is happening and how to avoid it, as I'm doing basically the same operations in the two versions of the code?
This result was very concerning to me, so I tested the same thing in a slightly more complex code, and it gives me a 20 second difference on average. In the end, it can affect my most complex analyses by weeks, maybe even months.
Thanks in advance!
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
data:image/s3,"s3://crabby-images/d5906/d59060df4059a6cc364216c4d63ceec29ef7fe66" alt="扫码二维码加入Web技术交流群"
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
使它们更加相同。
以下想法也许能(也许不能)解释全部的时间差异:
不要把
rk4_solution()
中的额外代码计入计时。像
main()
中那样,在 rk4_solution() 内部进行计时。使用
restrict
在
rk4_solution()
中,x、par、f 所指向的数据
可能重叠,这会影响优化潜力,而在 main()
中,已知它们不重叠。有许多方法可以提高这两种写法的性能,但提问者在这里似乎是想找出性能差异的原因。
Make them more the same.
Ideas that may or may not account for all time differences:
Do not time extra code in
rk4_solution()
Perform timing in
rk4_solution()
like that in main()
. Use
restrict
Within
rk4_solution()
, the data pointed to by x, par, f
may overlap, affecting optimization potential, whereas in main()
they are known to not overlap. There are numerous ways to increase performance for both methods, yet OP here seems to want to identify performance differences.