一段看上去“貌不惊人”的Delphi插入汇编代码,居然带有授权许可声明;与经典的同类型函数相比,它确实“身手不凡”。
研究代码的目的在于借鉴,本文通过分析,并用C++重写代码进行比较,再次证明这段代码效率较高的主要原因在于思路(或者算法),与语言本身效率关系不大。今天打开Delphi2007的SysUtils.pas文件查看一个函数代码,偶然看到字符串拷贝函数StrCopy中的插入汇编代码,感觉与记忆中Delphi7的同名函数中的代码大不相同,我的汇编水平虽不算精通,但自认还过得去,可粗粗看了一下,竟没完全看明白。找出Delphi7的StrCopy代码初步比较分析了一下,给我的第一印象是Delphi2007的StrCopy函数代码既粗糙,又难懂,拷贝速度也肯定不及Delphi7的StrCopy函数。下面分别摘录这两段代码,相信不少人都会有与我类似的感觉:Delphi7的StrCopy函数代码:function StrCopy(Dest: PChar; const Source: PChar): PChar; asm PUSH EDI PUSH ESI MOV ESI,EAX MOV EDI,EDX MOV ECX,0FFFFFFFFH XOR AL,AL REPNE SCASB NOT ECX MOV EDI,ESI MOV ESI,EDX MOV EDX,ECX MOV EAX,EDI SHR ECX,2 REP MOVSD MOV ECX,EDX AND ECX,3 REP MOVSB POP ESI POP EDI end; function StrCopy(Dest: PChar; const Source: PChar): PChar;asmPUSH EDIPUSH ESIMOV ESI,EAXMOV EDI,EDXMOV ECX,0FFFFFFFFHXOR AL,ALREPNE SCASBNOT ECXMOV EDI,ESIMOV ESI,EDXMOV EDX,ECXMOV EAX,EDISHR ECX,2REP MOVSDMOV ECX,EDXAND ECX,3REP MOVSBPOP ESIPOP EDIend; Delphi2007的StrCopy函数代码:function StrCopy(Dest: PChar; const Source: PChar): PChar; asm sub edx, eax test eax, 1 push eax jz @loop movzx ecx, byte ptr[eax+edx] mov [eax], cl test ecx, ecx jz @ret inc eax @loop: movzx ecx, byte ptr[eax+edx] test ecx, ecx jz @move0 movzx ecx, word ptr[eax+edx] mov [eax], cx add eax, 2 cmp ecx, 255 ja @loop @ret: pop eax ret @move0: mov [eax], cl pop eax end; function StrCopy(Dest: PChar; const Source: PChar): PChar;asmsub edx, eaxtest eax, 1push eaxjz @loopmovzx ecx, byte ptr[eax+edx]mov [eax], cltest ecx, ecxjz @retinc eax@loop:movzx ecx, byte ptr[eax+edx]test ecx, ecxjz @move0movzx ecx, word ptr[eax+edx]mov [eax], cxadd eax, 2cmp ecx, 255ja @loop@ret:pop eaxret@move0:mov [eax], clpop eaxend; 正感叹难怪Delphi每况愈下,连库代码都改得如此之差,反过来又一想,如果这段代码比以前的代码还差,为什么要改呢?难道CodeGear的程序员水平如此之差?抱着疑问,又找出Delphi2010的StrCopy函数,除了PChar改为PAnsiChar外,其它与Delphi2007一样。这才想到这段代码肯定有它的过人之处!果然,在Delphi2007和Delphi2010的StrCopy函数前有一段注释,被我这完全不懂英语的人给忽略了:(* ***** BEGIN LICENSE BLOCK ***** * * The function StrCopy is licensed under the CodeGear license terms. 
* * The initial developer of the original code is Fastcode * * Portions created by the initial developer are Copyright (C) 2002-2004 * the initial developer. All Rights Reserved. * * Contributor(s): Aleksandr Sharahov * * ***** END LICENSE BLOCK ***** *) (* ***** BEGIN LICENSE BLOCK ******* The function StrCopy is licensed under the CodeGear license terms.** The initial developer of the original code is Fastcode** Portions created by the initial developer are Copyright (C) 2002-2004* the initial developer. All Rights Reserved.** Contributor(s): Aleksandr Sharahov** ***** END LICENSE BLOCK ***** *) 用Google的在线翻译翻译了一下,这才知道,原来这段代码还是有授权许可的!这才真是“人不可貌相”啊。若干年前,小平同志就教导过我们:“实践是检验真理的唯一标准”,照他的话办应该没错。于是将这两段代码摘录下来,分别改名为StrCopy7和StrCopy2007,写了一段简单代码,用80兆字节的字符串进行了一下速度测试:const TestSize = 80 * 1024 * 1024 + 2; var Dest, Source: PChar; p, pe: PChar; TickCount7, TickCount2007: Longword; begin GetMem(Source, TestSize); GetMem(Dest, TestSize); Randomize; p := Source; pe := p + TestSize - 1; while p < pe do begin p^ := char(Random(255)); if p^ >= #32 then Inc(p); end; p^ := #0; TickCount7 := GetTickCount; StrCopy7(Dest, Source); TickCount7 := GetTickCount - TickCount7; TickCount2007 := GetTickCount; StrCopy2007(Dest, Source); TickCount2007 := GetTickCount - TickCount2007; FreeMem(Dest); FreeMem(Source); ShowMessage(Format('StrCopy7: %d, StrCopy2007: %d', [TickCount7, TickCount2007])); end; constTestSize = 80 * 1024 * 1024 + 2;varDest, Source: PChar;p, pe: PChar;TickCount7, TickCount2007: Longword;beginGetMem(Source, TestSize);GetMem(Dest, TestSize);Randomize;p := Source;pe := p + TestSize - 1;while p < pe dobeginp^ := char(Random(255));if p^ >= #32 then Inc(p);end;p^ := #0;TickCount7 := GetTickCount;StrCopy7(Dest, Source);TickCount7 := GetTickCount - TickCount7;TickCount2007 := GetTickCount;StrCopy2007(Dest, Source);TickCount2007 := GetTickCount - TickCount2007;FreeMem(Dest);FreeMem(Source);ShowMessage(Format('StrCopy7: %d, StrCopy2007: %d', [TickCount7, TickCount2007]));end; 
测试出的结果超出我的预料:StrCopy7与StrCopy2007的拷贝速度竟然相差2.5 - 4倍!呵呵,果然是有“授权许可”的代码呀,还真是“身手不凡”,要知道StrCopy7采用的并非一般的单字节拷贝,而是采用的每次4字节拷贝,本身就是一段相当高效的字符串拷贝代码,比它还高出2.5 - 4倍速度的代码,还真叫人难以相信!为了让有些不大懂汇编的朋友也能欣赏到这段“貌不惊人”代码,我给这2段代码逐句加上汉字注释贴在下面(文章后面用C++重写了这2段代码):// in: eax=dest,edx=Source out: eax=Dest function StrCopy7(Dest: PChar; const Source: PChar): PChar; asm PUSH EDI PUSH ESI MOV ESI,EAX // 保存Dest在esi // 计算字符串Source的长度 MOV EDI,EDX // edi = Source MOV ECX,0FFFFFFFFH // ecx = 最大无符号长整型数 XOR AL,AL // al = 0(0为C语言字符串结束符) REPNE SCASB // 在Source中查找结束符位置 NOT ECX // ecx取反为Source长度(包括结束符在内) // 拷贝Source到Dest(包括结束符在内) MOV EDI,ESI // edi = Dest MOV ESI,EDX // esi = Source MOV EDX,ECX // 保存Source的长度在edx MOV EAX,EDI // eax = Dest(函数返回值) SHR ECX,2 // ecx /= 4 REP MOVSD // 按每次4字节进行循环拷贝 MOV ECX,EDX AND ECX,3 // ecx = edx % 4(按4字节拷贝后的剩余字节) REP MOVSB // 按单字节拷贝循环拷贝剩余字节 POP ESI POP EDI end; // in: eax=dest,edx=Source out: eax=Dest function StrCopy2007(Dest: PChar; const Source: PChar): PChar; asm sub edx, eax // Source地址减Dest地址 test eax, 1 // 测试Dest地址值是否为奇数 push eax // 保存函数返回植 jz @loop movzx ecx, byte ptr[eax+edx] // 如果Dest地址值为奇数 mov [eax], cl // 拷贝Source的一字节到Dest test ecx, ecx // 如果是字符串结束符,返回Dest jz @ret inc eax // 否则Dest地址值调整为偶数 @loop: // 循环逐字拷贝Source到Dest movzx ecx, byte ptr[eax+edx] // 从Source中预读一字节 test ecx, ecx // 如果是字符串结束符,拷贝后返回Dest jz @move0 movzx ecx, word ptr[eax+edx] // 拷贝Source的一字到Dest mov [eax], cx add eax, 2 // Dest地址值加2,因edx为Source与Dest之差, // eax+edx为Source地址下一地址值 cmp ecx, 255 // 如果已拷贝字大于255,继续下一字拷贝。 // 注:因前面已通过预读对结束符进行判断处理, // 故已拷贝字低字节不可能为0,所以已拷贝字 // <=255,说明其高字节为0,拷贝结束 ja @loop @ret: pop eax ret @move0: mov [eax], cl pop eax end; // in: eax=dest,edx=Source out: eax=Destfunction StrCopy7(Dest: PChar; const Source: PChar): PChar;asmPUSH EDIPUSH ESIMOV ESI,EAX // 保存Dest在esi// 计算字符串Source的长度MOV EDI,EDX // edi = SourceMOV ECX,0FFFFFFFFH // ecx = 最大无符号长整型数XOR AL,AL // al = 0(0为C语言字符串结束符)REPNE SCASB // 在Source中查找结束符位置NOT ECX // ecx取反为Source长度(包括结束符在内)// 拷贝Source到Dest(包括结束符在内)MOV EDI,ESI // edi = 
DestMOV ESI,EDX // esi = SourceMOV EDX,ECX // 保存Source的长度在edxMOV EAX,EDI // eax = Dest(函数返回值)SHR ECX,2 // ecx /= 4REP MOVSD // 按每次4字节进行循环拷贝MOV ECX,EDXAND ECX,3 // ecx = edx % 4(按4字节拷贝后的剩余字节)REP MOVSB // 按单字节拷贝循环拷贝剩余字节POP ESIPOP EDIend;// in: eax=dest,edx=Source out: eax=Destfunction StrCopy2007(Dest: PChar; const Source: PChar): PChar;asmsub edx, eax // Source地址减Dest地址test eax, 1 // 测试Dest地址值是否为奇数push eax // 保存函数返回植jz @loopmovzx ecx, byte ptr[eax+edx] // 如果Dest地址值为奇数mov [eax], cl // 拷贝Source的一字节到Desttest ecx, ecx // 如果是字符串结束符,返回Destjz @retinc eax // 否则Dest地址值调整为偶数@loop: // 循环逐字拷贝Source到Destmovzx ecx, byte ptr[eax+edx] // 从Source中预读一字节test ecx, ecx // 如果是字符串结束符,拷贝后返回Destjz @move0movzx ecx, word ptr[eax+edx] // 拷贝Source的一字到Destmov [eax], cxadd eax, 2 // Dest地址值加2,因edx为Source与Dest之差,// eax+edx为Source地址下一地址值cmp ecx, 255 // 如果已拷贝字大于255,继续下一字拷贝。// 注:因前面已通过预读对结束符进行判断处理,// 故已拷贝字低字节不可能为0,所以已拷贝字// <=255,说明其高字节为0,拷贝结束ja @loop@ret:pop eaxret@move0:mov [eax], clpop eaxend;我仔细分析了一下StrCopy2007比StrCopy7效率高的原因,主要有三个方面:一、StrCopy7对Source进行了2次循环处理,一次是为了计算Source的长度而进行的扫描循环,另一次是拷贝循环,这是一种传统的字符串拷贝函数编码思路;而StrCopy2007则是一次性循环处理,虽然看上去其循环过程中的代码有些“啰嗦”,但效率确实较高,也值得我们在处理类似问题上进行借鉴,这一点与语言没多大关系;二、说明汇编的字符串处理指令效率并不高,我将StrCopy7的2句主要的字符串处理语句用“啰嗦”代码进行了替换,在我的机器上拷贝速度一下就提高了38%(这个与硬件有关系)。下面代码中注释掉的是原语句,小写汇编代码是替换语句:function StrCopy_(Dest: PChar; const Source: PChar): PChar; asm PUSH EDI PUSH ESI MOV ESI,EAX MOV EDI,EDX MOV ECX,0FFFFFFFFH XOR AL,AL @loop1: inc edi dec ecx cmp al, [edi - 1] jne @loop1 // REPNE SCASB NOT ECX MOV EDI,ESI MOV ESI,EDX MOV EDX,ECX MOV EAX,EDI SHR ECX,2 push eax @loop2: mov eax, [esi] mov [edi], eax add esi, 4 add edi, 4 loop @loop2 pop eax // REP MOVSD MOV ECX,EDX AND ECX,3 REP MOVSB POP ESI POP EDI end; function StrCopy_(Dest: PChar; const Source: PChar): PChar;asmPUSH EDIPUSH ESIMOV ESI,EAXMOV EDI,EDXMOV ECX,0FFFFFFFFHXOR AL,AL@loop1:inc edidec ecxcmp al, [edi - 1]jne @loop1// REPNE SCASBNOT ECXMOV EDI,ESIMOV ESI,EDXMOV EDX,ECXMOV EAX,EDISHR ECX,2push eax@loop2:mov eax, [esi]mov 
[edi], eaxadd esi, 4add edi, 4loop @loop2pop eax// REP MOVSDMOV ECX,EDXAND ECX,3REP MOVSBPOP ESIPOP EDIend; 三、目标串Dest的地址偶数对齐。因为StrCopy2007是按字进行拷贝的,Dest地址的奇偶对拷贝速度有一定影响,去掉StrCopy2007中有关Dest奇偶调整的代码后,在我的机器上测试,奇数Dest地址与偶数Dest地址拷贝速度相差14%左右;不仅如此,Source地址的奇偶性也影响拷贝速度,其相差为7%左右;如果Dest和Source的地址都是奇数,拷贝速度则相差28%以上。StrCopy2007只调整了Dest地址的奇偶性,因为Source的奇偶性没法调整。很显然,上面第一点是最主要的原因,其次是第三点,这2个原因属于编程思路(或算法)问题,与语言无多大关系,这也是我分析这段代码最大的收获。为了证明这一点,按照上面2段代码的思路,用C++分别写了2个拷贝函数和测试代码,采用BCB6编译器编译,我的机器上的测试结果是StrCopy2的拷贝速度是StrCopy1的1.6 - 1.9倍。把这2段C++代码贴在下面作为本文的结尾:char* StrCopy1(char *dest, const char *source) { char *pd = dest; char *pe, *ps = (char*)source; int ext, size; while (*ps ++); size = ps - source; ext = size & 3; ps = (char*)source; pe = ps + (size & 0xfffffffc); for (; ps < pe; pd += 4, ps += 4) *(long*)pd = *(long*)ps; for (; ext > 0; ext --) *pd ++ = *ps ++; return dest; } char* StrCopy2(char *dest, const char *source) { char *pd = dest; int s = source - dest; if ((unsigned)pd & 1) { *pd = *source; if (*pd == 0) return dest; pd ++; } while (true) { if (*(pd + s) == 0) break; *(short*)pd = *(short*)(pd + s); if (*(unsigned short*)pd <= 255) return dest; pd += 2; } *pd = 0; return dest; } #define TESTSIZE (80 * 1024 * 1024 + 2) void __fastcall TForm1::Button1Click(TObject *Sender) { unsigned long time1, time2; char *dest = new char[TESTSIZE]; char *source = new char[TESTSIZE]; char *p = source; char *pe = p + TESTSIZE - 1; randomize(); while (p < pe) { *p = random(255); if (*p >= 32) p ++; } *p = 0; time1 = GetTickCount(); StrCopy1(dest, source); time1 = GetTickCount() - time1; time2 = GetTickCount(); StrCopy2(dest, source); time2 = GetTickCount() - time2; delete[] source; delete[] dest; ShowMessage("StrCopy1: " + String(time1) + " StrCopy2: " + String(time2)); } char* StrCopy1(char *dest, const char *source){ char *pd = dest;char *pe, *ps = (char*)source;int ext, size;while (*ps ++);size = ps - source;ext = size & 3;ps = (char*)source;pe = ps + (size & 
0xfffffffc);for (; ps < pe; pd += 4, ps += 4)*(long*)pd = *(long*)ps;for (; ext > 0; ext --)*pd ++ = *ps ++;return dest;}char* StrCopy2(char *dest, const char *source){ char *pd = dest;int s = source - dest;if ((unsigned)pd & 1){ *pd = *source;if (*pd == 0)return dest;pd ++;}while (true){ if (*(pd + s) == 0)break;*(short*)pd = *(short*)(pd + s);if (*(unsigned short*)pd <= 255)return dest;pd += 2;}*pd = 0;return dest;}#define TESTSIZE (80 * 1024 * 1024 + 2)void __fastcall TForm1::Button1Click(TObject *Sender){ unsigned long time1, time2;char *dest = new char[TESTSIZE];char *source = new char[TESTSIZE];char *p = source;char *pe = p + TESTSIZE - 1;randomize();while (p < pe){ *p = random(255);if (*p >= 32) p ++;}*p = 0;time1 = GetTickCount();StrCopy1(dest, source);time1 = GetTickCount() - time1;time2 = GetTickCount();StrCopy2(dest, source);time2 = GetTickCount() - time2;delete[] source;delete[] dest;ShowMessage("StrCopy1: " + String(time1) + " StrCopy2: " + String(time2));} 当然,由于现在计算机处理速度很快,且一般程序中极少有大容量的字符串拷贝,对一般字符串拷贝来说,StrCopy7和StrCopy2007的拷贝速度差距可忽略不计,本文的主要目的在于对优秀代码的欣赏和借鉴。由于水平有限,代码分析可能有错误,望指正,不胜感激。邮件地址:maozefa@hotmail.com