我写了一个小型基准测试,用来比较Python,Ruby,JavaScript和C++的不同解释器/编译器的性能.正如预期的那样,(优化后的)C++胜过了脚本语言,但它领先的倍数高得惊人.
结果是:
sven@jet:~/tmp/js$ time node bla.js # * JavaScript with node * 0 real 0m1.222s user 0m1.190s sys 0m0.015s sven@jet:~/tmp/js$ time ruby foo.rb # * Ruby * 0 real 0m52.428s user 0m52.395s sys 0m0.028s sven@jet:~/tmp/js$ time python blub.py # * Python with CPython * 0 real 1m16.480s user 1m16.371s sys 0m0.080s sven@jet:~/tmp/js$ time pypy blub.py # * Python with PyPy * 0 real 0m4.707s user 0m4.579s sys 0m0.028s sven@jet:~/tmp/js$ time ./cpp_non_optimized 1000 1000000 # * C++ with -O0 (gcc) * 0 real 0m1.702s user 0m1.699s sys 0m0.002s sven@jet:~/tmp/js$ time ./cpp_optimized 1000 1000000 # * C++ with -O3 (gcc) * 0 real 0m0.003s # (!!!) <---------------------------------- WHY? user 0m0.002s sys 0m0.002s
我想知道是否有人可以解释为什么优化的C++代码比其他所有代码快三个数量级.
C++基准测试使用命令行参数,以防止在编译时预先计算结果.
下面,我给出了各语言基准测试的源代码,它们在语义上应该是等效的.另外,我还提供了优化后的C++代码的编译器输出(使用gcc生成的汇编代码).查看优化后的汇编代码时,编译器似乎把基准测试中的两个循环合并成了一个循环,但是,循环仍然存在!
JavaScript:
var s = 0; var outer = 1000; var inner = 1000000; for (var i = 0; i < outer; ++i) { for (var j = 0; j < inner; ++j) { ++s; } s -= inner; } console.log(s);
Python:
s = 0 outer = 1000 inner = 1000000 for _ in xrange(outer): for _ in xrange(inner): s += 1 s -= inner print s
Ruby:
s = 0 outer = 1000 inner = 1000000 outer_end = outer - 1 inner_end = inner - 1 for i in 0..outer_end for j in 0..inner_end s = s + 1 end s = s - inner end puts s
C++:
#include <iostream>
#include <cstdlib>
#include <cstdint>
int main(int argc, char* argv[]) { uint32_t s = 0; uint32_t outer = atoi(argv[1]); uint32_t inner = atoi(argv[2]); for (uint32_t i = 0; i < outer; ++i) { for (uint32_t j = 0; j < inner; ++j) ++s; s -= inner; } std::cout << s << std::endl; return 0; }
汇编(使用gcc -S -O3 -std=c++0x编译上述C++代码时的输出):
.file "bar.cpp" .section .text.startup,"ax",@progbits .p2align 4,,15 .globl main .type main, @function main: .LFB1266: .cfi_startproc pushq %r12 .cfi_def_cfa_offset 16 .cfi_offset 12, -16 movl $10, %edx movq %rsi, %r12 pushq %rbp .cfi_def_cfa_offset 24 .cfi_offset 6, -24 pushq %rbx .cfi_def_cfa_offset 32 .cfi_offset 3, -32 movq 8(%rsi), %rdi xorl %esi, %esi call strtol movq 16(%r12), %rdi movq %rax, %rbp xorl %esi, %esi movl $10, %edx call strtol testl %ebp, %ebp je .L6 movl %ebp, %ebx xorl %eax, %eax xorl %edx, %edx .p2align 4,,10 .p2align 3 .L3: # <--- Here is the loop addl $1, %eax # <--- cmpl %eax, %ebx # <--- ja .L3 # <--- .L2: movl %edx, %esi movl $_ZSt4cout, %edi call _ZNSo9_M_insertImEERSoT_ movq %rax, %rdi call _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_ popq %rbx .cfi_remember_state .cfi_def_cfa_offset 24 popq %rbp .cfi_def_cfa_offset 16 xorl %eax, %eax popq %r12 .cfi_def_cfa_offset 8 ret .L6: .cfi_restore_state xorl %edx, %edx jmp .L2 .cfi_endproc .LFE1266: .size main, .-main .p2align 4,,15 .type _GLOBAL__sub_I_main, @function _GLOBAL__sub_I_main: .LFB1420: .cfi_startproc subq $8, %rsp .cfi_def_cfa_offset 16 movl $_ZStL8__ioinit, %edi call _ZNSt8ios_base4InitC1Ev movl $__dso_handle, %edx movl $_ZStL8__ioinit, %esi movl $_ZNSt8ios_base4InitD1Ev, %edi addq $8, %rsp .cfi_def_cfa_offset 8 jmp __cxa_atexit .cfi_endproc .LFE1420: .size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main .section .init_array,"aw" .align 8 .quad _GLOBAL__sub_I_main .local _ZStL8__ioinit .comm _ZStL8__ioinit,1,1 .hidden __dso_handle .ident "GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2" .section .note.GNU-stack,"",@progbits
ecatmur.. 102
优化器已经判断出内层循环连同其后的那一行合起来是空操作(no-op),并将其消除.不幸的是,它没能把外层循环也一并消除.
请注意,node.js示例比未优化的C++示例更快,这表明V8(node的JIT编译器)已设法消除了至少一个循环.但是,它的优化有一定开销,因为(像任何JIT编译器一样)它必须在优化以及基于性能剖析(profile-guided)的重新优化所带来的收益与这样做的成本之间取得平衡.