|
Message-ID: <CA+TsHUAcZA5s9s-co3xmOT9+buawXbPn2Qc4KxT-Cvnsm=TwmA@mail.gmail.com> Date: Thu, 18 Sep 2014 18:31:15 +0530 From: Sayantan Datta <std2048@...il.com> To: john-users@...ts.openwall.com Subject: Re: Re: nVidia Maxwell support (especially descrypt)? On Thu, Sep 18, 2014 at 5:36 PM, Roman Rusakov <rusakovster@...il.com> wrote: > Good day. > > I started work to optimize bitslice DES sboxes for Nvidia Maxwell > instruction set (particularly for LOP3.LUT). > Looks like gate count can be decreased by 9 (from ~33 to 24). > For example, S4 (only 17 gates). > === > #include <stdio.h> > > typedef unsigned long long vtype; > > vtype lut3(vtype x,vtype y,vtype z,unsigned char m) > { > int i; > vtype r=0; > for(i=0;i<sizeof(vtype)*8;i++) > r|=(vtype)((m>>( (((x>>i)&1)<<2) | (((y>>i)&1)<<1) | ((z>>i)&1) ))&1) << i; > return r; > } > > void s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, > vtype * out1, vtype * out2, vtype * out3, vtype * out4) > { > vtype x55AAFF00=lut3(a1,a4,a5,0x36); > vtype x00F00F00=lut3(a3,a4,a5,0x24); > vtype x1926330C=lut3(a2,a3,x55AAFF00,0xA4); > vtype x4CA36B59=lut3(x00F00F00,a1,x1926330C,0xB6); > > vtype x00FF55AA=lut3(a1,a4,a5,0x6C); > vtype x3FCC6E9D=lut3(a2,a3,x00FF55AA,0x5E); > vtype x6A7935C8=lut3(a1,x00F00F00,x3FCC6E9D,0xD6); > > vtype x5D016B55=lut3(a1,x4CA36B59,x00FF55AA,0xD4); > vtype x07AE9F5A=lut3(a3,x55AAFF00,x5D016B55,0xD6); > vtype x61C8F93C=lut3(a1,a2,x07AE9F5A,0x96); > > vtype x3=lut3(a6,x4CA36B59,x61C8F93C,0xC9); > vtype x4=lut3(a6,x4CA36B59,x61C8F93C,0x93); > *out3^=x3; > *out4^=x4; > > vtype x26DA5E91=x4CA36B59^x6A7935C8; > vtype x37217F22=lut3(a2,a4,x26DA5E91,0x72); > vtype x56E9861E=x37217F22^x61C8F93C; > > vtype x1=lut3(a6,x56E9861E,x6A7935C8,0x5C); > vtype x2=lut3(a6,x56E9861E,x6A7935C8,0x35); > *out1^=x1; > *out2^=x2; > } > > void main() > { > vtype a1,a2,a3,a4,a5,a6,x1,x2,x3,x4; > a1=0x5555555555555555; > a2=0x3333333333333333; > a3=0x0F0F0F0F0F0F0F0F; > a4=0x00FF00FF00FF00FF; > a5=0x0000FFFF0000FFFF; > 
a6=0x00000000FFFFFFFF; > x1=x2=x3=x4=0; > s4(a1,a2,a3,a4,a5,a6,&x1,&x2,&x3,&x4); > printf("%016llX\n",x1); > printf("%016llX\n",x2); > printf("%016llX\n",x3); > printf("%016llX\n",x4); > } > === > How much impact does it have on performance? I guess it also reduces the number of instructions, which could translate into better utilization of the i-cache, provided you do the same for all 8 sboxes. Regards, Sayantan
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.