GNU bug report logs - #22382
[PATCH] grep: -x now supersedes -w more consistently

Previous Next

Package: grep;

Reported by: Paul Eggert <eggert <at> cs.ucla.edu>

Date: Sat, 16 Jan 2016 07:07:02 UTC

Severity: normal

Tags: patch

Done: Paul Eggert <eggert <at> cs.ucla.edu>

Bug is archived. No further changes may be made.

To add a comment to this bug, you must first unarchive it, by sending
a message to control AT debbugs.gnu.org, with unarchive 22382 in the body.
You can then email your comments to 22382 AT debbugs.gnu.org in the normal way.

Toggle the display of automated, internal messages from the tracker.

View this report as an mbox folder, status mbox, maintainer mbox


Report forwarded to bug-grep <at> gnu.org:
bug#22382; Package grep. (Sat, 16 Jan 2016 07:07:02 GMT) Full text and rfc822 format available.

Acknowledgement sent to Paul Eggert <eggert <at> cs.ucla.edu>:
New bug report received and forwarded. Copy sent to bug-grep <at> gnu.org. (Sat, 16 Jan 2016 07:07:02 GMT) Full text and rfc822 format available.

Message #5 received at submit <at> debbugs.gnu.org (full text, mbox):

From: Paul Eggert <eggert <at> cs.ucla.edu>
To: bug-grep <at> gnu.org
Cc: Paul Eggert <eggert <at> cs.ucla.edu>
Subject: [PATCH] grep: -x now supersedes -w more consistently
Date: Fri, 15 Jan 2016 23:06:22 -0800
* NEWS, doc/grep.texi (Matching Control): Mention this.
* src/dfasearch.c (EGexecute):
* src/pcresearch.c (Pcompile):
Don't get confused by -w if -x is also present.
* src/pcresearch.c (Pcompile): Remove misleading comment about
non-UTF-8 multibyte locales, as PCRE doesn't support them.
Calculate buffer sizes more carefully; the old method
allocated a buffer slightly too big, seemingly due to luck.
* tests/backref-word, tests/pcre: Add tests for this bug.
---
 NEWS               |  5 ++++-
 doc/grep.texi      |  1 +
 src/dfasearch.c    |  6 +++---
 src/pcresearch.c   | 24 +++++++++++++-----------
 tests/backref-word |  4 ++++
 tests/pcre         |  5 ++++-
 6 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/NEWS b/NEWS
index a0f6bbb..9de7fcb 100644
--- a/NEWS
+++ b/NEWS
@@ -32,9 +32,12 @@ GNU grep NEWS                                    -*- outline -*-
   This partly reverts the --exclude-related change in 2.22.
   [bug introduced in grep-2.22]
 
-  --line-buffer is no longer ineffective when combined with -l
+  --line-buffer is no longer ineffective when combined with -l.
   [bug introduced in grep-2.5]
 
+  -xw is now equivalent to -x more consistently, with -P and with backrefs.
+  [bug only partially fixed in grep-2.19]
+
 
 * Noteworthy changes in release 2.22 (2015-11-01) [stable]
 
diff --git a/doc/grep.texi b/doc/grep.texi
index 76769b9..8883b27 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -233,6 +233,7 @@ Similarly,
 it must be either at the end of the line
 or followed by a non-word constituent character.
 Word-constituent characters are letters, digits, and the underscore.
+This option has no effect if @option{-x} is also specified.
 
 @item -x
 @itemx --line-regexp
diff --git a/src/dfasearch.c b/src/dfasearch.c
index a330eac..e04a2df 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -363,14 +363,14 @@ EGexecute (char *buf, size_t size, size_t *match_size,
                   len = end - ptr;
                   goto assess_pattern_match;
                 }
-              /* If -w, check if the match aligns with word boundaries.
-                 We do this iteratively because:
+              /* If -w and not -x, check whether the match aligns with
+                 word boundaries.  Do this iteratively because:
                  (a) the line may contain more than one occurrence of the
                  pattern, and
                  (b) Several alternatives in the pattern might be valid at a
                  given point, and we may need to consider a shorter one to
                  find a word boundary.  */
-              if (match_words)
+              if (!match_lines && match_words)
                 while (match <= best_match)
                   {
                     regoff_t shorter_len = 0;
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 1fae94d..3fee67a 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -98,7 +98,13 @@ Pcompile (char const *pattern, size_t size)
 #else
   int e;
   char const *ep;
-  char *re = xnmalloc (4, size + 7);
+  static char const wprefix[] = "(?<!\\w)(?:";
+  static char const wsuffix[] = ")(?!\\w)";
+  static char const xprefix[] = "^(?:";
+  static char const xsuffix[] = ")$";
+  int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
+                         sizeof xprefix - 1 + sizeof xsuffix - 1);
+  char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
   int flags = (PCRE_MULTILINE
                | (match_icase ? PCRE_CASELESS : 0));
   char const *patlim = pattern + size;
@@ -120,20 +126,16 @@ Pcompile (char const *pattern, size_t size)
     error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
 
   *n = '\0';
-  if (match_lines)
-    strcpy (n, "^(?:");
   if (match_words)
-    strcpy (n, "(?<!\\w)(?:");
+    strcpy (n, wprefix);
+  if (match_lines)
+    strcpy (n, xprefix);
   n += strlen (n);
 
   /* The PCRE interface doesn't allow NUL bytes in the pattern, so
      replace each NUL byte in the pattern with the four characters
      "\000", removing a preceding backslash if there are an odd
-     number of backslashes before the NUL.
-
-     FIXME: This method does not work with some multibyte character
-     encodings, notably Shift-JIS, where a multibyte character can end
-     in a backslash byte.  */
+     number of backslashes before the NUL.  */
   for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
     {
       memcpy (n, p, pnul - p);
@@ -149,9 +151,9 @@ Pcompile (char const *pattern, size_t size)
   n += patlim - p;
   *n = '\0';
   if (match_words)
-    strcpy (n, ")(?!\\w)");
+    strcpy (n, wsuffix);
   if (match_lines)
-    strcpy (n, ")$");
+    strcpy (n, xsuffix);
 
   cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
   if (!cre)
diff --git a/tests/backref-word b/tests/backref-word
index 557c6d8..e5b5486 100755
--- a/tests/backref-word
+++ b/tests/backref-word
@@ -9,6 +9,10 @@ for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
   out=out1-$LOC
   LC_ALL=$LOC grep -w '\(foo\) \1' exp1 > $out || fail=1
   compare exp1 $out || fail=1
+
+  LC_ALL=$LOC grep -wx '\(foo\) \1' exp1 > $out
+  test $? -eq 1 || fail=1
+  compare /dev/null $out || fail=1
 done
 
 Exit $fail
diff --git a/tests/pcre b/tests/pcre
index a9dfb4b..92e788e 100755
--- a/tests/pcre
+++ b/tests/pcre
@@ -1,5 +1,5 @@
 #! /bin/sh
-# Ensure that with -P, \s matches a newline.
+# Simple PCRE tests.
 #
 # Copyright (C) 2001, 2006, 2009-2016 Free Software Foundation, Inc.
 #
@@ -15,4 +15,7 @@ fail=0
 echo | grep -P '\s*$' || fail=1
 echo | grep -zP '\s$' || fail=1
 
+echo '.ab' | grep -Pwx ab
+test $? -eq 1 || fail=1
+
 Exit $fail
-- 
2.5.0





Bug closed; send any further explanations to 22382 <at> debbugs.gnu.org and Paul Eggert <eggert <at> cs.ucla.edu>. Request was from Paul Eggert <eggert <at> cs.ucla.edu> to control <at> debbugs.gnu.org. (Sat, 16 Jan 2016 07:09:01 GMT) Full text and rfc822 format available.

bug archived. Request was from Debbugs Internal Request <help-debbugs <at> gnu.org> to internal_control <at> debbugs.gnu.org. (Sat, 13 Feb 2016 12:24:04 GMT) Full text and rfc822 format available.

This bug report was last modified 9 years and 133 days ago.

Previous Next


GNU bug tracking system
Copyright (C) 1999 Darren O. Benham, 1997,2003 nCipher Corporation Ltd, 1994-97 Ian Jackson.