liblzma: SHA-256: Unroll a little more.

This way a branch isn't needed for each operation
to choose between blk0 and blk2, and still the code
doesn't grow as much as it would with full unrolling.
This commit is contained in:
Lasse Collin 2014-08-03 20:33:38 +03:00
parent bc7650d87b
commit 9a096f8e57
1 changed files with 16 additions and 9 deletions

View File

@ -47,11 +47,12 @@
#define g(i) T[(6 - i) & 7]
#define h(i) T[(7 - i) & 7]
#define R(i) \
h(i) += S1(e(i)) + Ch(e(i), f(i), g(i)) + SHA256_K[i + j] \
+ (j ? blk2(i) : blk0(i)); \
#define R(i, j, blk) \
h(i) += S1(e(i)) + Ch(e(i), f(i), g(i)) + SHA256_K[i + j] + blk; \
d(i) += h(i); \
h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
#define R0(i) R(i, 0, blk0(i))
#define R2(i) R(i, j, blk2(i))
#define S0(x) (rotr_32(x, 2) ^ rotr_32(x, 13) ^ rotr_32(x, 22))
#define S1(x) (rotr_32(x, 6) ^ rotr_32(x, 11) ^ rotr_32(x, 25))
@ -88,12 +89,18 @@ transform(uint32_t state[8], const uint32_t data[16])
// Copy state[] to working vars.
memcpy(T, state, sizeof(T));
// 64 operations, partially loop unrolled
for (unsigned int j = 0; j < 64; j += 16) {
R( 0); R( 1); R( 2); R( 3);
R( 4); R( 5); R( 6); R( 7);
R( 8); R( 9); R(10); R(11);
R(12); R(13); R(14); R(15);
// The first 16 operations unrolled
R0( 0); R0( 1); R0( 2); R0( 3);
R0( 4); R0( 5); R0( 6); R0( 7);
R0( 8); R0( 9); R0(10); R0(11);
R0(12); R0(13); R0(14); R0(15);
// The remaining 48 operations partially unrolled
for (unsigned int j = 16; j < 64; j += 16) {
R2( 0); R2( 1); R2( 2); R2( 3);
R2( 4); R2( 5); R2( 6); R2( 7);
R2( 8); R2( 9); R2(10); R2(11);
R2(12); R2(13); R2(14); R2(15);
}
// Add the working vars back into state[].