Package: gzip;
Reported by: l00374334 <liqiang64 <at> huawei.com>
Date: Tue, 26 May 2020 05:18:02 UTC
Severity: normal
Tags: patch, wontfix
Message #11 received at 41535 <at> debbugs.gnu.org (full text, mbox):
From: Li Qiang <liqiang64 <at> huawei.com> To: <41535 <at> debbugs.gnu.org> Cc: meyering <at> fb.com, eggert <at> cs.ucla.edu Subject: Re: bug#41535: [PATCH] performance optimization for aarch64 Date: Thu, 20 Aug 2020 16:55:26 +0800
在 2020/5/30 17:17, Li Qiang 写道: > > > 在 2020/5/26 10:39, l00374334 写道: >> From: liqiang <liqiang64 <at> huawei.com> >> >> By analyzing the compression and decompression process of gzip, I found >> >> that the hot spots of CRC32 and longest_match function are very high. >> >> >> >> On the aarch64 architecture, we can optimize the efficiency of crc32 >> >> through the interface provided by the neon instruction set (12x faster >> >> in aarch64), and optimize the performance of random access code through >> >> prefetch instructions (about 5%~8% improvement). In some compression >> >> scenarios, loop expansion can also get a certain performance improvement >> >> (about 10%). >> >> >> >> Modify by Li Qiang. >> >> --- >> configure | 14 ++++++++++++++ >> deflate.c | 30 +++++++++++++++++++++++++++++- >> util.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ >> 3 files changed, 88 insertions(+), 1 deletion(-) >> >> diff --git a/configure b/configure >> index cab3daf..dc80cb6 100644 >> --- a/configure >> +++ b/configure >> @@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext >> ;; >> >> arm* | aarch64 ) >> + cat confdefs.h - <<_ACEOF >conftest.$ac_ext >> +/* end confdefs.h. */ >> +#if defined __ARM_NEON__ || defined __ARM_NEON >> + int ok; >> + #else >> + error fail >> + #endif >> + >> +_ACEOF >> +if ac_fn_c_try_compile "$LINENO" >> +then : >> + CFLAGS="$CFLAGS -march=armv8-a+crc" >> +fi >> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext >> # Assume arm with EABI. >> # On arm64 systems, the C compiler may be generating code in one of >> # these ABIs: >> diff --git a/deflate.c b/deflate.c >> index 9d379e9..ee77ffd 100644 >> --- a/deflate.c >> +++ b/deflate.c >> @@ -378,6 +378,9 @@ longest_match(IPos cur_match) >> register int len; /* length of current match */ >> >> int best_len = prev_length; /* best match length so far */ >> >> IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL; >> >> +#ifdef __aarch64__ >> >> + IPos next_match; >> >> +#endif >> >> /* Stop when cur_match becomes <= limit. To simplify the code, >> >> * we prevent matches with the string of window index 0. >> >> */ >> >> @@ -411,6 +414,10 @@ longest_match(IPos cur_match) >> do { >> >> Assert(cur_match < strstart, "no future"); >> >> match = window + cur_match; >> >> +#ifdef __aarch64__ >> >> + next_match = prev[cur_match & WMASK]; >> >> + __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK]))); >> >> +#endif >> >> >> >> /* Skip to next match if the match length cannot increase >> >> * or if the match length is less than 2: >> >> @@ -488,8 +495,14 @@ longest_match(IPos cur_match) >> scan_end = scan[best_len]; >> >> #endif >> >> } >> >> - } while ((cur_match = prev[cur_match & WMASK]) > limit >> >> + } >> >> +#ifdef __aarch64__ >> >> + while ((cur_match = next_match) > limit >> >> + && --chain_length != 0); >> >> +#else >> >> + while ((cur_match = prev[cur_match & WMASK]) > limit >> >> && --chain_length != 0); >> >> +#endif >> >> >> >> return best_len; >> >> } >> >> @@ -777,7 +790,22 @@ deflate (int pack_level) >> lookahead -= prev_length-1; >> >> prev_length -= 2; >> >> RSYNC_ROLL(strstart, prev_length+1); >> >> + while (prev_length >= 4) { >> >> + /* After actual verification, expanding this loop >> >> + * can improve its performance in certain scenarios. >> >> + */ >> >> + prev_length -= 4; >> >> + strstart++; >> >> + INSERT_STRING(strstart, hash_head); >> >> + strstart++; >> >> + INSERT_STRING(strstart, hash_head); >> >> + strstart++; >> >> + INSERT_STRING(strstart, hash_head); >> >> + strstart++; >> >> + INSERT_STRING(strstart, hash_head); >> >> + } >> >> do { >> >> + if (prev_length == 0) break; >> >> strstart++; >> >> INSERT_STRING(strstart, hash_head); >> >> /* strstart never exceeds WSIZE-MAX_MATCH, so there are >> >> diff --git a/util.c b/util.c >> index 0a0fc21..c9f0e52 100644 >> --- a/util.c >> +++ b/util.c >> @@ -38,6 +38,12 @@ >> >> >> static int write_buffer (int, voidp, unsigned int); >> >> >> >> +#if defined __ARM_NEON__ || defined __ARM_NEON >> >> +#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(val)) >> >> +#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val)) >> >> +#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val)) >> >> +#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val)) >> >> +#else >> >> /* ======================================================================== >> >> * Table of CRC-32's of all single-byte values (made by makecrc.c) >> >> */ >> >> @@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = { >> 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, >> >> 0x2d02ef8dL >> >> }; >> >> +#endif >> >> >> >> /* Shift register contents. */ >> >> static ulg crc = 0xffffffffL; >> >> @@ -132,6 +139,43 @@ ulg updcrc(s, n) >> const uch *s; /* pointer to bytes to pump through */ >> >> unsigned n; /* number of bytes in s[] */ >> >> { >> >> +#if defined __ARM_NEON__ || defined __ARM_NEON >> >> + register ulg c; >> >> + static ulg crc = (ulg)0xffffffffL; >> >> + register const uint8_t *buf1; >> >> + register const uint16_t *buf2; >> >> + register const uint32_t *buf4; >> >> + register const uint64_t *buf8; >> >> + int64_t length = (int64_t)n; >> >> + buf8 = (const uint64_t *)(const void *)s; >> >> + >> >> + if (s == NULL) { >> >> + c = 0xffffffffL; >> >> + } else { >> >> + c = crc; >> >> + while(length >= sizeof(uint64_t)) { >> >> + CRC32D(c, *buf8++); >> >> + length -= sizeof(uint64_t); >> >> + } >> >> + buf4 = (const uint32_t *)(const void *)buf8; >> >> + if (length >= sizeof(uint32_t)) { >> >> + CRC32W(c, *buf4++); >> >> + length -= sizeof(uint32_t); >> >> + } >> >> + buf2 = (const uint16_t *)(const void *)buf4; >> >> + if(length >= sizeof(uint16_t)) { >> >> + CRC32H(c, *buf2++); >> >> + length -= sizeof(uint16_t); >> >> + } >> >> + buf1 = (const uint8_t *)(const void *)buf2; >> >> + if (length >= sizeof(uint8_t)) { >> >> + CRC32B(c, *buf1); >> >> + length -= sizeof(uint8_t); >> >> + } >> >> + } >> >> + crc = c; >> >> + return (c ^ 0xffffffffL); >> >> +#else >> >> register ulg c; /* temporary variable */ >> >> >> >> if (s == NULL) { >> >> @@ -144,6 +188,7 @@ ulg updcrc(s, n) >> } >> >> crc = c; >> >> return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ >> >> +#endif >> >> } >> >> >> >> /* Return a current CRC value. */ >> > > Please allow me to show a set of actual test data for this patch. > > First, I made an original version of the program "gzip-1.10" based > on the gzip-1.10 source code, and then made an optimized version of > the program "gzip-optimized" after applying my optimization patch. > > Next I use gzip-1.10 version to test the compression and decompression > time on some **xml** files: > [XML]# time ./gzip-1.10 *.xml > > real 0m5.099s > user 0m4.384s > sys 0m0.176s > [XML]# time ./gzip-1.10 -d *.gz > > real 0m2.173s > user 0m1.821s > sys 0m0.348s > > Then use the optimized version to compare: > [XML]# time ./gzip-optimized *.xml > > real 0m2.785s > user 0m2.576s > sys 0m0.204s > [XML]# time ./gzip-optimized -d *.gz > > real 0m0.497s > user 0m0.176s > sys 0m0.320s > > > The next test object is a large **log** file: > [LOG]# time ./gzip-1.10 *.log > > real 0m8.883s > user 0m8.652s > sys 0m0.217s > [LOG]# time ./gzip-1.10 -d *.gz > > real 0m3.049s > user 0m2.604s > sys 0m0.439s > > Also use the optimized version to compare: > [LOG]# time ./gzip-optimized *.log > > real 0m6.882s > user 0m6.607s > sys 0m0.264s > [LOG]# time ./gzip-optimized -d *.gz > > real 0m1.054s > user 0m0.622s > sys 0m0.431s > > The above experimental data are from the aarch64 platform. > Gentle ping. : ) -- Best regards, Li Qiang
GNU bug tracking system
Copyright (C) 1999 Darren O. Benham,
1997,2003 nCipher Corporation Ltd,
1994-97 Ian Jackson.