我的目标是在
Python中为频谱有限元编写一个小型库,为此我尝试使用Boost扩展python和C库,希望它能使我的代码更快.
class Quad {
public:
Quad(int, int);
double integrate(boost::function const&)> const&);
double integrate_wrapper(boost::python::object const&);
std::vector nodes;
std::vector weights;
};
...
namespace std {
typedef std::vector > cube;
typedef std::vector mat;
typedef std::vector vec;
}
...
double Quad::integrate(boost::function const& func) {
double result = 0.;
for (unsigned int i = 0; i result += func(nodes[i]) * weights[i];
}
return result;
}
// ---- PYTHON WRAPPER ----
double Quad::integrate_wrapper(boost::python::object const& func) {
std::function lambda;
switch (this->nodes[0].size()) {
case 1: lambda = [&func](vec const& v) -> double { return boost::python::extract(func (v[0])); }; break;
case 2: lambda = [&func](vec const& v) -> double { return boost::python::extract(func(v[0], v[1])); }; break;
case 3: lambda = [&func](vec const& v) -> double { return boost::python::extract(func(v[0], v[1], v[2])); }; break;
default: cout <<"Dimension must be 1, 2, or 3" <}
return integrate(lambda);
}
// ---- EXPOSE TO PYTHON ----
BOOST_PYTHON_MODULE(hermite)
{
using namespace boost::python;
class_("double_vector")
.def(vector_indexing_suite())
;
class_("double_mat")
.def(vector_indexing_suite())
;
class_("Quad", init())
.def("integrate", &Quad::integrate_wrapper)
.def_readonly("nodes", &Quad::nodes)
.def_readonly("weights", &Quad::weights)
;
}
我比较了三种不同方法的性能来计算两个函数的积分.这两个功能是&#xff1a;
>函数f1(x,y,z)&#61; x * x
>一个更难评估的函数&#xff1a;f2(x,y,z)&#61; np.cos(2 * x 2 * y 2 * z)x * y np.exp(-z * z)np.cos(2 * x 2 * y 2 * z)x * y np.exp(-z * z)np.cos(2 * x 2 * y 2 * z)x * y np.exp(-z * z)np.cos (2 * x 2 * y 2 * z)x * y np.exp(-z * z)np.cos(2 * x 2 * y 2 * z)x * y np.exp(-z * z)
使用的方法是&#xff1a;
>从C程序调用库&#xff1a;
double func(vector v) {
return F1_OR_F2;
}
int main() {
hermite::Quad quadrature(100, 3);
double result &#61; quadrature.integrate(func);
cout <<"Result &#61; " <}
>从Python脚本调用库&#xff1a;
import hermite
def function(x, y, z): return F1_OR_F2
my_quad &#61; hermite.Quad(100, 3)
result &#61; my_quad.integrate(function)
>在Python中使用for循环&#xff1a;
import hermite
def function(x, y, z): return F1_OR_F2
my_quad &#61; hermite.Quad(100, 3)
weights &#61; my_quad.weights
nodes &#61; my_quad.nodes
result &#61; 0.
for i in range(len(weights)):
result &#43;&#61; weights[i] * function(nodes[i][0], nodes[i][1], nodes[i][2])
以下是每种方法的执行时间(使用方法1的time命令测量时间,方法2和3使用python模块时间,使用Cmake和set编译C代码(CMAKE_BUILD_TYPE Release))
>对于f1&#xff1a;
>方法1&#xff1a;0.07s用户0.01s系统99&#xff05;cpu 0.083总计
>方法2&#xff1a;0.19s
>方法3&#xff1a;3.06s
>对于f2&#xff1a;
>方法1&#xff1a;0.28s用户0.01s系统99&#xff05;cpu 0.289总计
>方法2&#xff1a;12.47s
>方法3&#xff1a;16.31s
根据这些结果,我的问题如下&#xff1a;
>为什么第一种方法比第二种方法快得多&#xff1f;
>可以改进python包装器以达到方法1和方法2之间相当的性能吗&#xff1f;
>为什么方法2比方法3更难以集成函数的难度&#xff1f;
编辑&#xff1a;我还尝试定义一个接受字符串作为参数的函数,将其写入文件,然后继续编译文件并动态加载生成的.so文件&#xff1a;
double Quad::integrate_from_string(string const& function_body) {
// Write function to file
ofstream helper_file;
helper_file.open("/tmp/helper_function.cpp");
helper_file <<"#include \n#include \n";
helper_file <<"extern \"C\" double toIntegrate(std::vector v) {\n";
helper_file <<" return " <helper_file.close();
// Compile file
system("c&#43;&#43; /tmp/helper_function.cpp -o /tmp/helper_function.so -shared -fPIC");
// Load function dynamically
typedef double (*vec_func)(vec);
void *function_so &#61; dlopen("/tmp/helper_function.so", RTLD_NOW);
vec_func func &#61; (vec_func) dlsym(function_so, "toIntegrate");
double result &#61; integrate(func);
dlclose(function_so);
return result;
}
它非常脏,可能不太便携,所以我很乐意找到一个更好的解决方案,但它运行良好,并且与sympy的ccode功能很好地配合.
第二次编辑我用纯粹的Python使用Numpy重写了这个函数.
import numpy as np
import numpy.polynomial.hermite_e as herm
import time
def integrate(function, degrees):
dim &#61; len(degrees)
nodes_multidim &#61; []
weights_multidim &#61; []
for i in range(dim):
nodes_1d, weights_1d &#61; herm.hermegauss(degrees[i])
nodes_multidim.append(nodes_1d)
weights_multidim.append(weights_1d)
grid_nodes &#61; np.meshgrid(*nodes_multidim)
grid_weights &#61; np.meshgrid(*weights_multidim)
nodes_flattened &#61; []
weights_flattened &#61; []
for i in range(dim):
nodes_flattened.append(grid_nodes[i].flatten())
weights_flattened.append(grid_weights[i].flatten())
nodes &#61; np.vstack(nodes_flattened)
weights &#61; np.prod(np.vstack(weights_flattened), axis&#61;0)
return np.dot(function(nodes), weights)
def function(v): return F1_OR_F2
result &#61; integrate(function, [100,100,100])
print("-> Result &#61; " &#43; str(result) &#43; ", Time &#61; " &#43; str(end-start))
有点令人惊讶(至少对我而言),此方法与纯C实现之间的性能没有显着差异.特别是,f1需要0.059s,f2需要0.36s.