GNU bug report logs - #58558
29.0.50; re-search-forward is slow in some buffers

Previous Next

Package: emacs;

Reported by: Ihor Radchenko <yantar92 <at> posteo.net>

Date: Sun, 16 Oct 2022 01:27:02 UTC

Severity: normal

Found in version 29.0.50

Full log


Message #224 received at 58558 <at> debbugs.gnu.org (full text, mbox):

From: Stefan Monnier <monnier <at> iro.umontreal.ca>
To: Ihor Radchenko <yantar92 <at> posteo.net>
Cc: 58558 <at> debbugs.gnu.org, Alan Mackenzie <acm <at> muc.de>,
 Eli Zaretskii <eliz <at> gnu.org>, larsi <at> gnus.org
Subject: Re: bug#58558: 29.0.50; re-search-forward is slow in some buffers
Date: Wed, 12 Apr 2023 19:23:19 -0400
[Message part 1 (text/plain, inline)]
> For the former, we could probably extend the `b_property` and
> `e_property` fields of `gl_state` (which hold charpos) to also store
> their bytepos equivalent, which should significantly reduce the number
> of conversions between bytepos and charpos.

I.e. something like the patch below (which passes all tests except for
`test/src/comp-tests` for a reason that completely escapes me).


        Stefan
[regmatch.patch (text/x-diff, inline)]
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 746779490ad..f75f805cd9c 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3979,7 +3979,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 
   /* Prevent shrinking and relocation of buffer text if GC happens
      while we are inside this function.  The calls to
-     UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
+     RE_UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
      `internal--syntax-propertize`); these calls are careful to defend against
      buffer modifications, but even with no modifications, the buffer text may
      be relocated during GC by `compact_buffer` which would invalidate
@@ -4792,12 +4792,11 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 		int s1, s2;
 		int dummy;
                 ptrdiff_t offset = POINTER_TO_OFFSET (d);
-                ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
-		UPDATE_SYNTAX_TABLE (charpos);
+		RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
 		GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
 		nchars++;
 		s1 = SYNTAX (c1);
-		UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+		RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
 		PREFETCH_NOLIMIT ();
 		GET_CHAR_AFTER (c2, d, dummy);
 		nchars++;
@@ -4832,8 +4831,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 	      int s1, s2;
 	      int dummy;
 	      ptrdiff_t offset = POINTER_TO_OFFSET (d);
-	      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
-	      UPDATE_SYNTAX_TABLE (charpos);
+	      RE_UPDATE_SYNTAX_TABLE (offset);
 	      PREFETCH ();
 	      GET_CHAR_AFTER (c2, d, dummy);
 	      nchars++;
@@ -4848,7 +4846,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 		{
 		  GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
 		  nchars++;
-		  UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+		  RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (offset);
 		  s1 = SYNTAX (c1);
 
 		  /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
@@ -4875,8 +4873,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 	      int s1, s2;
 	      int dummy;
               ptrdiff_t offset = POINTER_TO_OFFSET (d);
-              ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
-	      UPDATE_SYNTAX_TABLE (charpos);
+	      RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
 	      GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
 	      nchars++;
 	      s1 = SYNTAX (c1);
@@ -4891,13 +4888,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 		  PREFETCH_NOLIMIT ();
 		  GET_CHAR_AFTER (c2, d, dummy);
 		  nchars++;
-                  UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+                  RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
 		  s2 = SYNTAX (c2);
 
 		  /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
 		     returns 0.  */
 		  if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
-	  goto fail;
+		    goto fail;
 		}
 	    }
 	  break;
@@ -4917,8 +4914,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 	      int c1, c2;
 	      int s1, s2;
 	      ptrdiff_t offset = POINTER_TO_OFFSET (d);
-	      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
-	      UPDATE_SYNTAX_TABLE (charpos);
+	      RE_UPDATE_SYNTAX_TABLE (offset);
 	      PREFETCH ();
 	      c2 = RE_STRING_CHAR (d, target_multibyte);
 	      nchars++;
@@ -4933,7 +4929,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 		{
 		  GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
 		  nchars++;
-		  UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+		  RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (offset);
 		  s1 = SYNTAX (c1);
 
 		  /* ... and S1 is Sword or Ssymbol.  */
@@ -4958,8 +4954,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 	      int c1, c2;
 	      int s1, s2;
               ptrdiff_t offset = POINTER_TO_OFFSET (d);
-              ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
-	      UPDATE_SYNTAX_TABLE (charpos);
+	      RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
 	      GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
 	      nchars++;
 	      s1 = SYNTAX (c1);
@@ -4974,7 +4969,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 		  PREFETCH_NOLIMIT ();
 		  c2 = RE_STRING_CHAR (d, target_multibyte);
 		  nchars++;
-		  UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+		  RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
 		  s2 = SYNTAX (c2);
 
 		  /* ... and S2 is Sword or Ssymbol.  */
@@ -4994,8 +4989,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 	    PREFETCH ();
 	    {
 	      ptrdiff_t offset = POINTER_TO_OFFSET (d);
-	      ptrdiff_t pos1 = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
-	      UPDATE_SYNTAX_TABLE (pos1);
+	      RE_UPDATE_SYNTAX_TABLE (offset);
 	    }
 	    {
 	      int len;
diff --git a/src/syntax.c b/src/syntax.c
index e9e04e2d638..0245038dc2d 100644
--- a/src/syntax.c
+++ b/src/syntax.c
@@ -250,6 +250,8 @@ SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count)
   gl_state.b_property = BEGV;
   gl_state.e_property = ZV + 1;
   gl_state.object = Qnil;
+  gl_state.b_re_byte = -1;
+  gl_state.e_re_byte = -1;
   if (parse_sexp_lookup_properties)
     {
       if (count > 0)
@@ -268,11 +270,12 @@ SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count)
    FROMBYTE is an regexp-byteoffset.  */
 
 void
-RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object,
-			          ptrdiff_t frombyte)
+RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object, ptrdiff_t frombyte)
 {
   SETUP_BUFFER_SYNTAX_TABLE ();
   gl_state.object = object;
+  gl_state.b_re_byte = -1;
+  gl_state.e_re_byte = -1;
   if (BUFFERP (gl_state.object))
     {
       struct buffer *buf = XBUFFER (gl_state.object);
@@ -282,7 +285,7 @@ RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object,
   else if (NILP (gl_state.object))
     {
       gl_state.b_property = BEG;
-      gl_state.e_property = ZV; /* FIXME: Why not +1 like in SETUP_SYNTAX_TABLE? */
+      gl_state.e_property = ZV;
     }
   else if (EQ (gl_state.object, Qt))
     {
@@ -295,8 +298,11 @@ RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object,
       gl_state.e_property = 1 + SCHARS (gl_state.object);
     }
   if (parse_sexp_lookup_properties)
-    update_syntax_table (RE_SYNTAX_TABLE_BYTE_TO_CHAR (frombyte),
-			 1, 1, gl_state.object);
+    {
+      update_syntax_table (RE_SYNTAX_TABLE_BYTE_TO_CHAR (frombyte),
+			   1, 1, gl_state.object);
+      re_update_byteoffsets ();
+    }
 }
 
 /* Update gl_state to an appropriate interval which contains CHARPOS.  The
diff --git a/src/syntax.h b/src/syntax.h
index 01982be25a0..3e20952053b 100644
--- a/src/syntax.h
+++ b/src/syntax.h
@@ -66,7 +66,7 @@ #define Vstandard_syntax_table BVAR (&buffer_defaults, syntax_table)
 struct gl_state_s
 {
   Lisp_Object object;			/* The object we are scanning.  */
-  ptrdiff_t start;			/* Where to stop.  */
+  ptrdiff_t start;			/* Where to stop(?FIXME?).  */
   ptrdiff_t stop;			/* Where to stop.  */
   bool use_global;			/* Whether to use global_code
 					   or c_s_t.  */
@@ -85,6 +85,11 @@ #define Vstandard_syntax_table BVAR (&buffer_defaults, syntax_table)
 					   and possibly at the
 					   intervals too, depending
 					   on:  */
+  /* The regexp engine prefers byteoffsets over char positions, so
+     store those to try and reduce the number of byte<->char conversions.
+     This is only kept uptodate when used from the regexp engine.  */
+  ptrdiff_t b_re_byte;      /* First byteoffset where c_s_t is valid.  */
+  ptrdiff_t e_re_byte;      /* First byteoffset where c_s_t is not valid.  */
 };
 
 extern struct gl_state_s gl_state;
@@ -145,19 +150,14 @@ SYNTAX (int c)
 
 extern unsigned char const syntax_spec_code[0400];
 
-/* Convert the regexp's BYTEOFFSET into a character position,
-   for the object recorded in gl_state with RE_SETUP_SYNTAX_TABLE_FOR_OBJECT.
-
-   The value is meant for use in code that does nothing when
-   parse_sexp_lookup_properties is false, so return 0 in that case,
-   for speed.  */
+/* Convert the regexp's BYTEOFFSET into a character position, for
+   the object recorded in gl_state with RE_SETUP_SYNTAX_TABLE_FOR_OBJECT.  */
 
 INLINE ptrdiff_t
 RE_SYNTAX_TABLE_BYTE_TO_CHAR (ptrdiff_t byteoffset)
 {
-  return (! parse_sexp_lookup_properties
-	  ? 0
-	  : STRINGP (gl_state.object)
+  eassert (parse_sexp_lookup_properties);
+  return (STRINGP (gl_state.object)
 	  ? string_byte_to_char (gl_state.object, byteoffset)
 	  : BUFFERP (gl_state.object)
 	  ? ((buf_bytepos_to_charpos
@@ -168,6 +168,44 @@ RE_SYNTAX_TABLE_BYTE_TO_CHAR (ptrdiff_t byteoffset)
 	  : byteoffset);
 }
 
+INLINE ptrdiff_t
+RE_SYNTAX_TABLE_CHAR_TO_BYTE (ptrdiff_t charpos)
+{
+  eassert (parse_sexp_lookup_properties);
+  return (STRINGP (gl_state.object)
+	  ? string_char_to_byte (gl_state.object, charpos)
+	  : BUFFERP (gl_state.object)
+	  ? ((buf_charpos_to_bytepos
+	      (XBUFFER (gl_state.object), charpos)
+	      - BUF_BEGV_BYTE (XBUFFER (gl_state.object))))
+	  : NILP (gl_state.object)
+	  ? CHAR_TO_BYTE (charpos) - BEGV_BYTE
+	  : charpos);
+}
+
+static void re_update_byteoffsets (void)
+{
+  gl_state.b_re_byte = RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.b_property);
+  eassert (gl_state.b_property
+           == RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.b_re_byte));
+  /* `e_property` is often set to EOB+1 (or to some value
+     much further than `stop` in narrowed buffers).  */
+  gl_state.e_re_byte
+    = gl_state.e_property > gl_state.stop
+      ? 1 + RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.stop)
+      : RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.e_property);
+  eassert (gl_state.e_property > gl_state.stop
+           ? gl_state.e_property
+             >= 1 + RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.e_re_byte - 1)
+           : gl_state.e_property
+             == RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.e_re_byte));
+}
+
+/* The regexp-engine doesn't keep track of char positions, but instead
+   uses byteoffsets, so `syntax.c` uses `UPDATE_SYNTAX_TABLE_*` functions,
+   passing them `charpos`s whereas `regexp.c` uses `RE_UPDATE_SYNTAX_TABLE_*`
+   functions, passing them byteoffsets.  */
+
 /* Make syntax table state (gl_state) good for CHARPOS, assuming it is
    currently good for a position before CHARPOS.  */
 
@@ -178,6 +216,36 @@ UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t charpos)
     update_syntax_table_forward (charpos, false, gl_state.object);
 }
 
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t byteoffset)
+{ /* Performs just-in-time syntax-propertization.  */
+  if (!parse_sexp_lookup_properties)
+    return;
+  eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative.  */
+  if (byteoffset >= gl_state.e_re_byte)
+    {
+      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+      eassert (charpos >= gl_state.e_property);
+      UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+      re_update_byteoffsets ();
+    }
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_FORWARD_BEFORE (ptrdiff_t byteoffset)
+{ /* Performs just-in-time syntax-propertization.  */
+  if (!parse_sexp_lookup_properties)
+    return;
+  eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative.  */
+  if (byteoffset > gl_state.e_re_byte)
+    {
+      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset) - 1;
+      eassert (charpos >= gl_state.e_property);
+      UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+      re_update_byteoffsets ();
+    }
+}
+
 /* Make syntax table state (gl_state) good for CHARPOS, assuming it is
    currently good for a position after CHARPOS.  */
 
@@ -188,6 +256,36 @@ UPDATE_SYNTAX_TABLE_BACKWARD (ptrdiff_t charpos)
     update_syntax_table (charpos, -1, false, gl_state.object);
 }
 
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BACKWARD (ptrdiff_t byteoffset)
+{
+  if (!parse_sexp_lookup_properties)
+    return;
+  eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative.  */
+  if (byteoffset < gl_state.b_re_byte)
+    {
+      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+      eassert (charpos < gl_state.b_property);
+      UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+      re_update_byteoffsets ();
+    }
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (ptrdiff_t byteoffset)
+{
+  if (!parse_sexp_lookup_properties)
+    return;
+  eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative.  */
+  if (byteoffset <= gl_state.b_re_byte)
+    {
+      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+      eassert (charpos <= gl_state.b_property);
+      UPDATE_SYNTAX_TABLE_FORWARD (charpos - 1);
+      re_update_byteoffsets ();
+    }
+}
+
 /* Make syntax table good for CHARPOS.  */
 
 INLINE void
@@ -197,6 +295,20 @@ UPDATE_SYNTAX_TABLE (ptrdiff_t charpos)
   UPDATE_SYNTAX_TABLE_FORWARD (charpos);
 }
 
+INLINE void
+RE_UPDATE_SYNTAX_TABLE (ptrdiff_t byteoffset)
+{
+  RE_UPDATE_SYNTAX_TABLE_BACKWARD (byteoffset);
+  RE_UPDATE_SYNTAX_TABLE_FORWARD  (byteoffset);
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BEFORE (ptrdiff_t byteoffset)
+{
+  RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (byteoffset);
+  RE_UPDATE_SYNTAX_TABLE_FORWARD_BEFORE  (byteoffset);
+}
+
 /* Set up the buffer-global syntax table.  */
 
 INLINE void

This bug report was last modified 2 years and 63 days ago.

Previous Next


GNU bug tracking system
Copyright (C) 1999 Darren O. Benham, 1997,2003 nCipher Corporation Ltd, 1994-97 Ian Jackson.