C++中函数指针与函数内联优化的关联性 -

ppgunjack

浏览: 80430 次
性别:
来自: 上海

最近访客更多访客>>

tanxr

zyi74

yonghong

zhouyu0914k

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

C++中函数指针与函数内联优化的关联性

C C++ C# GCC ITeye

c++对内联优化的处理是个很重要的知识点，对这个问题的考虑来自这个帖子：
http://www.iteye.com/topic/1055377，其中涉及的另一个链接http://blog.csdn.net/yongzhewuwei_2008/archive/2006/11/16/1387476.aspx，提到了Java在运行时对多态函数的内联优化。
在c++中，通过基类指针调用的多态函数是无法被内联优化的，因为基类指针实际指向的对象是基类还是子类要到运行时才能确定，编译器在静态编译时无法知道应当展开哪个函数体。
需要注意的是，造成无法内联化的不是多态或者继承本身，根本原因是在于静态编译条件下对函数指针的调用无法定位到静态代码地址，因此无法将用函数指针来进行函数调用的地方用所调用代码内联化。
举个例子：

// Empty function; one of the two targets that main() may call through the
// function pointer pf.
void fn1(){
}
// Empty function; the other target that main() may call through the
// function pointer pf.
void fn2(){
}
// Loops 1000 times; on each iteration pf is set to fn2 while i <= 20 and to
// fn1 once i > 20, and the selected function is then called indirectly
// through pf.
int main(){
    // Pointer to a function taking no arguments and returning void.
    void (*pf)(void);

    for(int i=0;i<1000;i++) {
        if(i>20) {
            pf=&fn1;
        }else{
            pf=&fn2;
        }
        // Indirect call: the callee depends on the value pf holds at this point.
        (*pf)();
    }
}

在gcc O0下生成的汇编如下：

main.o:     file format pe-i386


Disassembly of section .text:

00000000 <__Z3fn1v>:
   0:	55                   	push   %ebp
   1:	89 e5                	mov    %esp,%ebp
   3:	c9                   	leave  
   4:	c3                   	ret    

00000005 <__Z3fn2v>:
   5:	55                   	push   %ebp
   6:	89 e5                	mov    %esp,%ebp
   8:	c9                   	leave  
   9:	c3                   	ret    

0000000a <_main>:
   a:	55                   	push   %ebp
   b:	89 e5                	mov    %esp,%ebp
   d:	83 e4 f0             	and    $0xfffffff0,%esp
  10:	83 ec 10             	sub    $0x10,%esp
  13:	e8 00 00 00 00       	call   18 <_main+0xe>
  18:	c7 44 24 08 00 00 00 	movl   $0x0,0x8(%esp)		//初始化i
  1f:	00 
  20:	eb 23                	jmp    45 <_main+0x3b>
  22:	83 7c 24 08 14       	cmpl   $0x14,0x8(%esp)		//比较i和20大小
  27:	7e 0a                	jle    33 <_main+0x29>
  29:	c7 44 24 0c 00 00 00 	movl   $0x0,0xc(%esp)		//函数fn1地址赋予pf
  30:	00 
  31:	eb 08                	jmp    3b <_main+0x31>
  33:	c7 44 24 0c 05 00 00 	movl   $0x5,0xc(%esp)		//函数fn2地址赋予pf
  3a:	00 
  3b:	8b 44 24 0c          	mov    0xc(%esp),%eax
  3f:	ff d0                	call   *%eax			//通过函数指针pf调用函数
  41:	ff 44 24 08          	incl   0x8(%esp)
  45:	81 7c 24 08 e7 03 00 	cmpl   $0x3e7,0x8(%esp)
  4c:	00 
  4d:	0f 9e c0             	setle  %al
  50:	84 c0                	test   %al,%al
  52:	75 ce                	jne    22 <_main+0x18>
  54:	b8 00 00 00 00       	mov    $0x0,%eax
  59:	c9                   	leave  
  5a:	c3                   	ret    
  5b:	90                   	nop

上面代码逻辑很清楚了，循环根据条件修改pf变量，而pf会读取到eax寄存器，然后通过call *%eax进行函数调用，那么如果在call *%eax处内联函数，则根本没法解决到底内联fn1还是fn2的问题。
在gcc O3下生成的汇编如下：

main.o:     file format pe-i386


Disassembly of section .text:

00000000 <__Z3fn1v>:
   0:	55                   	push   %ebp
   1:	89 e5                	mov    %esp,%ebp
   3:	c9                   	leave  
   4:	c3                   	ret    
   5:	8d 76 00             	lea    0x0(%esi),%esi

00000008 <__Z3fn2v>:
   8:	55                   	push   %ebp
   9:	89 e5                	mov    %esp,%ebp
   b:	c9                   	leave  
   c:	c3                   	ret    
   d:	8d 76 00             	lea    0x0(%esi),%esi

00000010 <_main>:
  10:	55                   	push   %ebp
  11:	89 e5                	mov    %esp,%ebp
  13:	83 e4 f0             	and    $0xfffffff0,%esp
  16:	53                   	push   %ebx
  17:	83 ec 0c             	sub    $0xc,%esp
  1a:	e8 00 00 00 00       	call   1f <_main+0xf>
  1f:	31 db                	xor    %ebx,%ebx	//ebx清零
  21:	b8 08 00 00 00       	mov    $0x8,%eax	//函数fn2地址赋予eax
  26:	66 90                	xchg   %ax,%ax		//2字节NOP(66 90)，仅作填充：应是为了让后面的跳转目标0x28落在8字节对齐的地址上
  28:	ff d0                	call   *%eax		//通过eax间接调用：首次进入时为fn2，从0x55跳回时(i>20)为fn1
  2a:	43                   	inc    %ebx		//i++
  2b:	81 fb e8 03 00 00    	cmp    $0x3e8,%ebx	//判断循环，ebx充当i
  31:	74 15                	je     48 <_main+0x38>	//相等结束循环
  33:	83 fb 14             	cmp    $0x14,%ebx	//i和20比较
  36:	7f 18                	jg     50 <_main+0x40>	//i>20跳转到50
  38:	b8 08 00 00 00       	mov    $0x8,%eax	//函数fn2地址赋予eax
  3d:	ff d0                	call   *%eax		//调用fn2
  3f:	43                   	inc    %ebx
  40:	81 fb e8 03 00 00    	cmp    $0x3e8,%ebx
  46:	75 eb                	jne    33 <_main+0x23>
  48:	31 c0                	xor    %eax,%eax
  4a:	83 c4 0c             	add    $0xc,%esp
  4d:	5b                   	pop    %ebx
  4e:	c9                   	leave  
  4f:	c3                   	ret    
  50:	b8 00 00 00 00       	mov    $0x0,%eax	//函数fn1地址赋予eax
  55:	eb d1                	jmp    28 <_main+0x18>
  57:	90                   	nop

可以看到在O3优化下，编译器使用了寄存器来代替函数指针变量pf和循环变量i，但依然无法将fn1和fn2内联化。

下面看个稍复杂点的例子：

// Empty function; main() assigns its address to pf when isFn2(pf) returns true.
void fn1(){
}
// Empty function; isFn2() compares its argument against this function's
// address, and main() assigns it to pf when isFn2(pf) returns false.
void fn2(){
}
// Returns true when the function pointer pf equals the address of fn2,
// and false otherwise.
bool isFn2(void (*pf)(void)){
    if(pf==&fn2) {
        return true;
    }
    return false;
}
// Loops 1000 times; each iteration flips pf based on its current value
// (fn2 -> fn1, anything else -> fn2) and then calls through pf.
int main(){
    // NOTE(review): pf has no initializer, so the first isFn2(pf) call below
    // reads an indeterminate pointer value (undefined behavior in C++).
    void (*pf)(void);
    for(int i=0;i<1000;i++) {
        if(isFn2(pf)) {
            pf=&fn1;
        }else{
            pf=&fn2;
        }
        // Indirect call through whichever function pf was just set to.
        (*pf)();
    }
}

从上面的代码逻辑可以看到，即将被调用的函数指针变量pf到底指向fn1还是fn2，取决于函数isFn2()的返回值，而isFn2()会根据函数指针pf当前的指向来判断返回结果。
在O0优化下，下面可以很明了的看到其跳转逻辑：

00000000 <__Z3fn1v>:
   0:	55                   	push   %ebp
   1:	89 e5                	mov    %esp,%ebp
   3:	c9                   	leave  
   4:	c3                   	ret    

00000005 <__Z3fn2v>:
   5:	55                   	push   %ebp
   6:	89 e5                	mov    %esp,%ebp
   8:	c9                   	leave  
   9:	c3                   	ret    

0000000a <__Z5isFn2PFvvE>:
   a:	55                   	push   %ebp
   b:	89 e5                	mov    %esp,%ebp
   d:	81 7d 08 05 00 00 00 	cmpl   $0x5,0x8(%ebp)
  14:	75 04                	jne    1a <__Z5isFn2PFvvE+0x10>
  16:	b0 01                	mov    $0x1,%al
  18:	eb 02                	jmp    1c <__Z5isFn2PFvvE+0x12>
  1a:	b0 00                	mov    $0x0,%al
  1c:	c9                   	leave  
  1d:	c3                   	ret    

0000001e <_main>:
  1e:	55                   	push   %ebp
  1f:	89 e5                	mov    %esp,%ebp
  21:	83 e4 f0             	and    $0xfffffff0,%esp
  24:	83 ec 20             	sub    $0x20,%esp
  27:	e8 00 00 00 00       	call   2c <_main+0xe>
  2c:	c7 44 24 18 00 00 00 	movl   $0x0,0x18(%esp)
  33:	00 
  34:	eb 2c                	jmp    62 <_main+0x44>
  36:	8b 44 24 1c          	mov    0x1c(%esp),%eax
  3a:	89 04 24             	mov    %eax,(%esp)
  3d:	e8 c8 ff ff ff       	call   a <__Z5isFn2PFvvE>
  42:	84 c0                	test   %al,%al
  44:	74 0a                	je     50 <_main+0x32>
  46:	c7 44 24 1c 00 00 00 	movl   $0x0,0x1c(%esp)
  4d:	00 
  4e:	eb 08                	jmp    58 <_main+0x3a>
  50:	c7 44 24 1c 05 00 00 	movl   $0x5,0x1c(%esp)
  57:	00 
  58:	8b 44 24 1c          	mov    0x1c(%esp),%eax
  5c:	ff d0                	call   *%eax
  5e:	ff 44 24 18          	incl   0x18(%esp)
  62:	81 7c 24 18 e7 03 00 	cmpl   $0x3e7,0x18(%esp)
  69:	00 
  6a:	0f 9e c0             	setle  %al
  6d:	84 c0                	test   %al,%al
  6f:	75 c5                	jne    36 <_main+0x18>
  71:	b8 00 00 00 00       	mov    $0x0,%eax
  76:	c9                   	leave  
  77:	c3                   	ret

上面可以看到isFn2()函数会被函数main所调用( call a <__Z5isFn2PFvvE> )，并且在O0优化下是不会被内联的。
但在O3优化下，情况又有所不同：

00000000 <__Z3fn1v>:
   0:	55                   	push   %ebp
   1:	89 e5                	mov    %esp,%ebp
   3:	c9                   	leave  
   4:	c3                   	ret    
   5:	8d 76 00             	lea    0x0(%esi),%esi

00000008 <__Z3fn2v>:
   8:	55                   	push   %ebp
   9:	89 e5                	mov    %esp,%ebp
   b:	c9                   	leave  
   c:	c3                   	ret    
   d:	8d 76 00             	lea    0x0(%esi),%esi

00000010 <__Z5isFn2PFvvE>:
  10:	55                   	push   %ebp
  11:	89 e5                	mov    %esp,%ebp
  13:	81 7d 08 08 00 00 00 	cmpl   $0x8,0x8(%ebp)   <---------------------------
  1a:	0f 94 c0             	sete   %al
  1d:	c9                   	leave  
  1e:	c3                   	ret    
  1f:	90                   	nop

00000020 <_main>:
  20:	55                   	push   %ebp
  21:	89 e5                	mov    %esp,%ebp
  23:	83 e4 f0             	and    $0xfffffff0,%esp
  26:	56                   	push   %esi
  27:	53                   	push   %ebx
  28:	83 ec 08             	sub    $0x8,%esp
  2b:	e8 00 00 00 00       	call   30 <_main+0x10>
  30:	bb e8 03 00 00       	mov    $0x3e8,%ebx
  35:	eb 0b                	jmp    42 <_main+0x22>
  37:	90                   	nop
  38:	be 08 00 00 00       	mov    $0x8,%esi
  3d:	ff d6                	call   *%esi
  3f:	4b                   	dec    %ebx
  40:	74 12                	je     54 <_main+0x34>
  42:	81 fe 08 00 00 00    	cmp    $0x8,%esi         <---------------------------
  48:	75 ee                	jne    38 <_main+0x18>
  4a:	be 00 00 00 00       	mov    $0x0,%esi
  4f:	ff d6                	call   *%esi
  51:	4b                   	dec    %ebx
  52:	75 ee                	jne    42 <_main+0x22>
  54:	31 c0                	xor    %eax,%eax
  56:	83 c4 08             	add    $0x8,%esp
  59:	5b                   	pop    %ebx
  5a:	5e                   	pop    %esi
  5b:	c9                   	leave  
  5c:	c3                   	ret    
  5d:	90                   	nop
  5e:	90                   	nop
  5f:	90                   	nop

注意上面箭头处是被内联化了的isFn2()代码。
可以看到，对函数指针变量pf的调用仍然是call *%esi，说明这个地方仍然是无法被内联化的。

通过函数指针调用的函数不能内联化，因此通过基类指针调用的多态函数自然也就无法被内联化，因为多态函数实际是通过虚函数表和偏移项来定位实际调用的函数指针，然后通过这个函数指针访问实际的函数代码。

通过函数指针调用的函数甚至也是不可能被优化消除的。
举个例子：

// Base class declaring a single public virtual member function.
class A{
public:
    // Virtual function with an empty body.
    void virtual fn(){}
};
// Class publicly derived from A that overrides the virtual function fn().
class SubA:public A{
public:
    // Override of A::fn(), also with an empty body.
    void virtual fn(){}
};
// Creates a SubA object with automatic storage and calls the virtual fn()
// through a pointer whose static type is the base class A*.
int main(){
    SubA suba;
    A* a=&suba;
    a->fn();
}

这段代码在O3优化下，汇编为：

00000014 <_main>:
  14:	55                   	push   %ebp
  15:	89 e5                	mov    %esp,%ebp
  17:	83 e4 f0             	and    $0xfffffff0,%esp
  1a:	83 ec 20             	sub    $0x20,%esp
  1d:	e8 00 00 00 00       	call   22 <_main+0xe>
  22:	c7 44 24 1c 08 00 00 	movl   $0x8,0x1c(%esp)
  29:	00 
  2a:	8d 44 24 1c          	lea    0x1c(%esp),%eax
  2e:	89 04 24             	mov    %eax,(%esp)
  31:	ff 15 08 00 00 00    	call   *0x8
  37:	31 c0                	xor    %eax,%eax
  39:	c9                   	leave  
  3a:	c3                   	ret    
  3b:	90                   	nop

可以看到函数fn()仍然通过虚表的虚函数指针被调用(call   *0x8).
而如果代码
// Calls the virtual fn() on a SubA object through a pointer of base type A*.
int main(){
    SubA suba;
    A* a=&suba;
    a->fn();
} 换为：
// Calls fn() on a SubA object through a pointer whose static type is the
// derived class SubA* rather than the base class.
int main(){
    SubA suba;
    SubA* a=&suba;
    a->fn();
}
则对应的O3优化为：

00000014 <_main>:
  14:	55                   	push   %ebp
  15:	89 e5                	mov    %esp,%ebp
  17:	83 e4 f0             	and    $0xfffffff0,%esp
  1a:	e8 00 00 00 00       	call   1f <_main+0xb>
  1f:	31 c0                	xor    %eax,%eax
  21:	c9                   	leave  
  22:	c3                   	ret    
  23:	90                   	nop

上面可以看到O3将 a->fn()的调用完全优化清除掉了。

文章开头所给链接中的文章提到，Java能在运行时动态地将基类指针的多态调用替换成内联，那么有个疑问，对于这样逻辑的代码：
    for(int i=0;i<1000;i++) {
        base=getRandomBaseOrChild();
        base.fn();
    }
java又如何能做到动态内联呢？

分享到：