openssl - Mirror of https://github.com/openssl/openssl

diff options

author	Andy Polyakov <appro@openssl.org>	2009-12-27 20:38:32 +0000
committer	Andy Polyakov <appro@openssl.org>	2009-12-27 20:38:32 +0000
commit	beef7145997c1183c8c154076d3fa3a7148ada60 (patch)
tree	45792236bc9a9fd3b2342536be01cb0cd3343a92 /ms/do_win64a.bat
parent	d741cf2267eca194817d300912b35da02806ca3e (diff)

Switch to new uplink assembler.

Diffstat (limited to 'ms/do_win64a.bat')

-rwxr-xr-x

ms/do_win64a.bat

1 files changed, 12 insertions, 2 deletions

######################################################################## # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions # # Copyright (c) 2013, Intel Corporation # # Authors: # Erdinc Ozturk <erdinc.ozturk@intel.com> # Vinodh Gopal <vinodh.gopal@intel.com> # James Guilford <james.guilford@intel.com> # Tim Chen <tim.c.chen@linux.intel.com> # # This software is available to you under a choice of one of two # licenses. You may choose to be licensed under the terms of the GNU # General Public License (GPL) Version 2, available from the file # COPYING in the main directory of this source tree, or the # OpenIB.org BSD license below: # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the # distribution. # # * Neither the name of the Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ######################################################################## # Function API: # UINT16 crc_t10dif_pcl( # UINT16 init_crc, //initial CRC value, 16 bits # const unsigned char *buf, //buffer pointer to calculate CRC on # UINT64 len //buffer length in bytes (64-bit data) # ); # # Reference paper titled "Fast CRC Computation for Generic # Polynomials Using PCLMULQDQ Instruction" # URL: http://www.intel.com/content/dam/www/public/us/en/documents # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf # # #include <linux/linkage.h> .text #define arg1 %rdi #define arg2 %rsi #define arg3 %rdx #define arg1_low32 %edi ENTRY(crc_t10dif_pcl) .align 16 # adjust the 16-bit initial_crc value, scale it to 32 bits shl $16, arg1_low32 # Allocate Stack Space mov %rsp, %rcx sub $16*2, %rsp # align stack to 16 byte boundary and $~(0x10 - 1), %rsp # check if smaller than 256 cmp $256, arg3 # for sizes less than 256, we can't fold 128B at a time... jl _less_than_128 # load the initial crc value movd arg1_low32, %xmm10 # initial crc # crc value does not need to be byte-reflected, but it needs # to be moved to the high part of the register. # because data will be byte-reflected and will align with # initial crc at correct place.
pslldq $12, %xmm10 movdqa SHUF_MASK(%rip), %xmm11 # receive the initial 128B data, xor the initial crc value movdqu 16*0(arg2), %xmm0 movdqu 16*1(arg2), %xmm1 movdqu 16*2(arg2), %xmm2 movdqu 16*3(arg2), %xmm3 movdqu 16*4(arg2), %xmm4 movdqu 16*5(arg2), %xmm5 movdqu 16*6(arg2), %xmm6 movdqu 16*7(arg2), %xmm7 pshufb %xmm11, %xmm0 # XOR the initial_crc value pxor %xmm10, %xmm0 pshufb %xmm11, %xmm1 pshufb %xmm11, %xmm2 pshufb %xmm11, %xmm3 pshufb %xmm11, %xmm4 pshufb %xmm11, %xmm5 pshufb %xmm11, %xmm6 pshufb %xmm11, %xmm7 movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 #imm value of pclmulqdq instruction #will determine which constant to use ################################################################# # we subtract 256 instead of 128 to save one instruction from the loop sub $256, arg3 # at this section of the code, there is 128*x+y (0<=y<128) bytes of # buffer. The _fold_64_B_loop (despite its name) will fold 128B at a time # until we have 128+y Bytes of buffer # fold 128B at a time. This section of the code folds 8 xmm # registers in parallel _fold_64_B_loop: # update the buffer pointer add $128, arg2 # buf += 128 movdqu 16*0(arg2), %xmm9 movdqu 16*1(arg2), %xmm12 pshufb %xmm11, %xmm9 pshufb %xmm11, %xmm12 movdqa %xmm0, %xmm8 movdqa %xmm1, %xmm13 pclmulqdq $0x0 , %xmm10, %xmm0 pclmulqdq $0x11, %xmm10, %xmm8 pclmulqdq $0x0 , %xmm10, %xmm1 pclmulqdq $0x11, %xmm10, %xmm13 pxor %xmm9 , %xmm0 xorps %xmm8 , %xmm0 pxor %xmm12, %xmm1 xorps %xmm13, %xmm1 movdqu 16*2(arg2), %xmm9 movdqu 16*3(arg2), %xmm12 pshufb %xmm11, %xmm9 pshufb %xmm11, %xmm12 movdqa %xmm2, %xmm8 movdqa %xmm3, %xmm13 pclmulqdq $0x0, %xmm10, %xmm2 pclmulqdq $0x11, %xmm10, %xmm8 pclmulqdq $0x0, %xmm10, %xmm3 pclmulqdq $0x11, %xmm10, %xmm13 pxor %xmm9 , %xmm2 xorps %xmm8 , %xmm2 pxor %xmm12, %xmm3 xorps %xmm13, %xmm3 movdqu 16*4(arg2), %xmm9 movdqu 16*5(arg2), %xmm12 pshufb %xmm11, %xmm9 pshufb %xmm11, %xmm12 movdqa %xmm4, %xmm8 movdqa %xmm5, %xmm13 pclmulqdq $0x0, %xmm10, %xmm4 pclmulqdq $0x11, %xmm10, %xmm8 pclmulqdq $0x0, 
%xmm10, %xmm5 pclmulqdq $0x11, %xmm10, %xmm13 pxor %xmm9 , %xmm4 xorps %xmm8 , %xmm4 pxor %xmm12, %xmm5 xorps %xmm13, %xmm5 movdqu 16*6(arg2), %xmm9 movdqu 16*7(arg2), %xmm12 pshufb %xmm11, %xmm9 pshufb %xmm11, %xmm12 movdqa %xmm6 , %xmm8 movdqa %xmm7 , %xmm13 pclmulqdq $0x0 , %xmm10, %xmm6 pclmulqdq $0x11, %xmm10, %xmm8 pclmulqdq $0x0 , %xmm10, %xmm7 pclmulqdq $0x11, %xmm10, %xmm13 pxor %xmm9 , %xmm6 xorps %xmm8 , %xmm6 pxor %xmm12, %xmm7 xorps %xmm13, %xmm7 sub $128, arg3 # check if there is another 128B in the buffer to be able to fold jge _fold_64_B_loop ################################################################## add $128, arg2 # at this point, the buffer pointer is pointing at the last y Bytes # of the buffer; the 128B of folded data is in 8 of the xmm # registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 # fold the 8 xmm registers to 1 xmm register with different constants movdqa rk9(%rip), %xmm10 movdqa %xmm0, %xmm8 pclmulqdq $0x11, %xmm10, %xmm0 pclmulqdq $0x0 , %xmm10, %xmm8 pxor %xmm8, %xmm7 xorps %xmm0, %xmm7 movdqa rk11(%rip), %xmm10 movdqa %xmm1, %xmm8 pclmulqdq $0x11, %xmm10, %xmm1 pclmulqdq $0x0 , %xmm10, %xmm8 pxor %xmm8, %xmm7 xorps %xmm1, %xmm7 movdqa rk13(%rip), %xmm10 movdqa %xmm2, %xmm8 pclmulqdq $0x11, %xmm10, %xmm2 pclmulqdq $0x0 , %xmm10, %xmm8 pxor %xmm8, %xmm7 pxor %xmm2, %xmm7 movdqa rk15(%rip), %xmm10 movdqa %xmm3, %xmm8 pclmulqdq $0x11, %xmm10, %xmm3 pclmulqdq $0x0 , %xmm10, %xmm8 pxor %xmm8, %xmm7 xorps %xmm3, %xmm7 movdqa rk17(%rip), %xmm10 movdqa %xmm4, %xmm8 pclmulqdq $0x11, %xmm10, %xmm4 pclmulqdq $0x0 , %xmm10, %xmm8 pxor %xmm8, %xmm7 pxor %xmm4, %xmm7 movdqa rk19(%rip), %xmm10 movdqa %xmm5, %xmm8 pclmulqdq $0x11, %xmm10, %xmm5 pclmulqdq $0x0 ,


context:
space:
mode: