Package: gzip;
Reported by: l00374334 <liqiang64 <at> huawei.com>
Date: Tue, 26 May 2020 05:18:02 UTC
Severity: normal
Tags: patch, wontfix
View this message in rfc822 format
From: Li Qiang <liqiang64 <at> huawei.com> To: <41535 <at> debbugs.gnu.org> Cc: luanjianhai <at> huawei.com, eggert <at> cs.ucla.edu, sangyan <at> huawei.com, colordev.jiang <at> huawei.com, luchunhua <at> huawei.com, huxinwei <at> huawei.com, meyering <at> fb.com Subject: bug#41535: [PATCH] performance optimization for aarch64 Date: Sat, 30 May 2020 17:17:49 +0800
在 2020/5/26 10:39, l00374334 写道: > From: liqiang <liqiang64 <at> huawei.com> > > By analyzing the compression and decompression process of gzip, I found > > that the hot spots of CRC32 and longest_match function are very high. > > > > On the aarch64 architecture, we can optimize the efficiency of crc32 > > through the interface provided by the neon instruction set (12x faster > > in aarch64), and optimize the performance of random access code through > > prefetch instructions (about 5%~8% improvement). In some compression > > scenarios, loop expansion can also get a certain performance improvement > > (about 10%). > > > > Modify by Li Qiang. > > --- > configure | 14 ++++++++++++++ > deflate.c | 30 +++++++++++++++++++++++++++++- > util.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 88 insertions(+), 1 deletion(-) > > diff --git a/configure b/configure > index cab3daf..dc80cb6 100644 > --- a/configure > +++ b/configure > @@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext > ;; > > arm* | aarch64 ) > + cat confdefs.h - <<_ACEOF >conftest.$ac_ext > +/* end confdefs.h. */ > +#if defined __ARM_NEON__ || defined __ARM_NEON > + int ok; > + #else > + error fail > + #endif > + > +_ACEOF > +if ac_fn_c_try_compile "$LINENO" > +then : > + CFLAGS="$CFLAGS -march=armv8-a+crc" > +fi > +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext > # Assume arm with EABI. > # On arm64 systems, the C compiler may be generating code in one of > # these ABIs: > diff --git a/deflate.c b/deflate.c > index 9d379e9..ee77ffd 100644 > --- a/deflate.c > +++ b/deflate.c > @@ -378,6 +378,9 @@ longest_match(IPos cur_match) > register int len; /* length of current match */ > > int best_len = prev_length; /* best match length so far */ > > IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL; > > +#ifdef __aarch64__ > > + IPos next_match; > > +#endif > > /* Stop when cur_match becomes <= limit. To simplify the code, > > * we prevent matches with the string of window index 0. > > */ > > @@ -411,6 +414,10 @@ longest_match(IPos cur_match) > do { > > Assert(cur_match < strstart, "no future"); > > match = window + cur_match; > > +#ifdef __aarch64__ > > + next_match = prev[cur_match & WMASK]; > > + __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK]))); > > +#endif > > > > /* Skip to next match if the match length cannot increase > > * or if the match length is less than 2: > > @@ -488,8 +495,14 @@ longest_match(IPos cur_match) > scan_end = scan[best_len]; > > #endif > > } > > - } while ((cur_match = prev[cur_match & WMASK]) > limit > > + } > > +#ifdef __aarch64__ > > + while ((cur_match = next_match) > limit > > + && --chain_length != 0); > > +#else > > + while ((cur_match = prev[cur_match & WMASK]) > limit > > && --chain_length != 0); > > +#endif > > > > return best_len; > > } > > @@ -777,7 +790,22 @@ deflate (int pack_level) > lookahead -= prev_length-1; > > prev_length -= 2; > > RSYNC_ROLL(strstart, prev_length+1); > > + while (prev_length >= 4) { > > + /* After actual verification, expanding this loop > > + * can improve its performance in certain scenarios. > > + */ > > + prev_length -= 4; > > + strstart++; > > + INSERT_STRING(strstart, hash_head); > > + strstart++; > > + INSERT_STRING(strstart, hash_head); > > + strstart++; > > + INSERT_STRING(strstart, hash_head); > > + strstart++; > > + INSERT_STRING(strstart, hash_head); > > + } > > do { > > + if (prev_length == 0) break; > > strstart++; > > INSERT_STRING(strstart, hash_head); > > /* strstart never exceeds WSIZE-MAX_MATCH, so there are > > diff --git a/util.c b/util.c > index 0a0fc21..c9f0e52 100644 > --- a/util.c > +++ b/util.c > @@ -38,6 +38,12 @@ > > > static int write_buffer (int, voidp, unsigned int); > > > > +#if defined __ARM_NEON__ || defined __ARM_NEON > > +#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(val)) > > +#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val)) > > +#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val)) > > +#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val)) > > +#else > > /* ======================================================================== > > * Table of CRC-32's of all single-byte values (made by makecrc.c) > > */ > > @@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = { > 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, > > 0x2d02ef8dL > > }; > > +#endif > > > > /* Shift register contents. */ > > static ulg crc = 0xffffffffL; > > @@ -132,6 +139,43 @@ ulg updcrc(s, n) > const uch *s; /* pointer to bytes to pump through */ > > unsigned n; /* number of bytes in s[] */ > > { > > +#if defined __ARM_NEON__ || defined __ARM_NEON > > + register ulg c; > > + static ulg crc = (ulg)0xffffffffL; > > + register const uint8_t *buf1; > > + register const uint16_t *buf2; > > + register const uint32_t *buf4; > > + register const uint64_t *buf8; > > + int64_t length = (int64_t)n; > > + buf8 = (const uint64_t *)(const void *)s; > > + > > + if (s == NULL) { > > + c = 0xffffffffL; > > + } else { > > + c = crc; > > + while(length >= sizeof(uint64_t)) { > > + CRC32D(c, *buf8++); > > + length -= sizeof(uint64_t); > > + } > > + buf4 = (const uint32_t *)(const void *)buf8; > > + if (length >= sizeof(uint32_t)) { > > + CRC32W(c, *buf4++); > > + length -= sizeof(uint32_t); > > + } > > + buf2 = (const uint16_t *)(const void *)buf4; > > + if(length >= sizeof(uint16_t)) { > > + CRC32H(c, *buf2++); > > + length -= sizeof(uint16_t); > > + } > > + buf1 = (const uint8_t *)(const void *)buf2; > > + if (length >= sizeof(uint8_t)) { > > + CRC32B(c, *buf1); > > + length -= sizeof(uint8_t); > > + } > > + } > > + crc = c; > > + return (c ^ 0xffffffffL); > > +#else > > register ulg c; /* temporary variable */ > > > > if (s == NULL) { > > @@ -144,6 +188,7 @@ ulg updcrc(s, n) > } > > crc = c; > > return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ > > +#endif > > } > > > > /* Return a current CRC value. */ > Please allow me to show a set of actual test data for this patch. First, I made an original version of the program "gzip-1.10" based on the gzip-1.10 source code, and then made an optimized version of the program "gzip-optimized" after applying my optimization patch. Next I use gzip-1.10 version to test the compression and decompression time on some **xml** files: [XML]# time ./gzip-1.10 *.xml real 0m5.099s user 0m4.384s sys 0m0.176s [XML]# time ./gzip-1.10 -d *.gz real 0m2.173s user 0m1.821s sys 0m0.348s Then use the optimized version to compare: [XML]# time ./gzip-optimized *.xml real 0m2.785s user 0m2.576s sys 0m0.204s [XML]# time ./gzip-optimized -d *.gz real 0m0.497s user 0m0.176s sys 0m0.320s The next test object is a large **log** file: [LOG]# time ./gzip-1.10 *.log real 0m8.883s user 0m8.652s sys 0m0.217s [LOG]# time ./gzip-1.10 -d *.gz real 0m3.049s user 0m2.604s sys 0m0.439s Also use the optimized version to compare: [LOG]# time ./gzip-optimized *.log real 0m6.882s user 0m6.607s sys 0m0.264s [LOG]# time ./gzip-optimized -d *.gz real 0m1.054s user 0m0.622s sys 0m0.431s The above experimental data are from the aarch64 platform. -- Best regards, Li Qiang
GNU bug tracking system
Copyright (C) 1999 Darren O. Benham,
1997,2003 nCipher Corporation Ltd,
1994-97 Ian Jackson.