GNU bug report logs - #65211
30.0.50; threads and accept-process-output

Previous Next

Package: emacs;

Reported by: Helmut Eller <eller.helmut <at> gmail.com>

Date: Thu, 10 Aug 2023 15:06:02 UTC

Severity: normal

Tags: patch

Found in version 30.0.50

To reply to this bug, email your comments to 65211 AT debbugs.gnu.org.

Toggle the display of automated, internal messages from the tracker.

View this report as an mbox folder, status mbox, maintainer mbox


Report forwarded to bug-gnu-emacs <at> gnu.org:
bug#65211; Package emacs. (Thu, 10 Aug 2023 15:06:02 GMT) Full text and rfc822 format available.

Acknowledgement sent to Helmut Eller <eller.helmut <at> gmail.com>:
New bug report received and forwarded. Copy sent to bug-gnu-emacs <at> gnu.org. (Thu, 10 Aug 2023 15:06:02 GMT) Full text and rfc822 format available.

Message #5 received at submit <at> debbugs.gnu.org (full text, mbox):

From: Helmut Eller <eller.helmut <at> gmail.com>
To: bug-gnu-emacs <at> gnu.org
Subject: 30.0.50; threads and accept-process-output
Date: Thu, 10 Aug 2023 17:04:48 +0200
[Message part 1 (text/plain, inline)]
The test case from below hangs when executed with

  emacs -Q --batch -l apo-tests.el -f ert-run-tests-batch-and-exit

It's not 100% reproducible, but it seems to hang 80% of the time.

The test is fairly complicated and involves a TLS connection, a
sub-process and a background thread.  The background thread starts a
sub-process and reads all its output until the process terminates.  Then
the main thread opens a TLS connection, sends a HTTP request and tries
to read the response.  At this point, accept-process-output hangs even
though there is output available.

I think the reason for this problem is that the sub-process uses a file
descriptor (usually number 6) and sets
fd_callback_info[6].waiting_thread to the background thread (in
compute_non_keyboard_wait_mask) .  Later, the TLS connection receives a
file descriptor with the same number (6) because the sub-process closed
it already.  That would be fine, however
fd_callback_info[6].waiting_thread is still set to the background
thread.  So this time compute_non_keyboard_wait_mask doesn't include 6
in the wait mask.  Because 6 is not in the wait mask,
emacs_gnutls_record_check_pending will not be called and Emacs doesn't
notice the buffered output.

The intention in the code seems to be that clear_waiting_thread_info
resets fd_callback_info[6].waiting_thread to NULL, however by the time
that clear_waiting_thread_info is called, max_desc was reduced to 4,
because somebody closed file descriptors 5 and 6.  So
clear_waiting_thread_info doesn't touch fd_callback_info[6].

A possible solution would be to reset the .waiting_thread field in
delete_read_fd, like so:

diff --git a/src/process.c b/src/process.c
index 08cb810ec13..74d0bf252ab 100644
--- a/src/process.c
+++ b/src/process.c
@@ -513,6 +513,9 @@ delete_read_fd (int fd)
     {
       fd_callback_info[fd].func = 0;
       fd_callback_info[fd].data = 0;
+
+      if (fd_callback_info[fd].waiting_thread == current_thread)
+	fd_callback_info[fd].waiting_thread = NULL;
     }
 }


With this change, the test doesn't hang.

Helmut

[apo-tests.el (text/plain, inline)]
;; -*- lexical-binding: t -*-

(require 'ert)
(require 'gnutls)

(defun t--read (proc nchars)
  "Read at least NCHARS characters from process PROC.
Return 'ok on success; otherwise return 'not-live."
  (let* ((buf (process-buffer proc))
	 (size (+ (buffer-size buf) nchars))
	 (result nil))
    (t--accept-process-output proc 0)
    (while (eq result nil)
      (cond ((>= (buffer-size buf) size)
	     (setq result 'ok))
	    ((not (process-live-p proc))
	     (setq result 'not-live))
	    (t
	     (t--accept-process-output proc nil))))
    result))

;; a wrapper around accept-process-output for debugging
(defun t--accept-process-output (proc timeout)
  (let ((buf (process-buffer proc)))
    (message "%s accept-process-output %S %s %s …"
	     (current-thread) proc (process-status proc) timeout)
    (let ((val (accept-process-output proc timeout)))
      (message "%s accept-process-output %S %s %s => %S %d"
	       (current-thread) proc (process-status proc) timeout
	       val (buffer-size buf))
      val)))

(defun t--read-all (proc)
  (while (pcase-exhaustive (t--read proc 1)
	   ('ok t)
	   ('not-live nil)))
  (with-current-buffer (process-buffer proc)
    (buffer-string)))

(defun t--make-buffer (name)
  (with-current-buffer (generate-new-buffer name t)
    (buffer-disable-undo)
    (set-buffer-multibyte nil)
    (current-buffer)))

(ert-deftest t-tls ()
  (thread-join
   (make-thread
    (lambda ()
      (let* ((proc (make-process
		    :name "cat"
		    :command (list "cat")
		    :sentinel (lambda (_ _))
		    :buffer (t--make-buffer "cat"))))
	(process-send-eof proc)
	(should (equal (t--read-all proc) ""))))))
  (let* ((host "www.example.net")
	 (proc (make-network-process
		:name "tls"
		:host host
		:service 443
		:tls-parameters (cons 'gnutls-x509pki
				      (gnutls-boot-parameters
				       :hostname host))
		:buffer (t--make-buffer "tls")
		:sentinel (lambda (_ _))))
	 (request (format "GET / HTTP/1.1\r\nHost: %s\r\n\r\n" host)))
    (process-send-string proc request)
    (process-send-eof proc)
    (let* ((response (t--read-all proc)))
      (should (string-match "^HTTP/1.1 200 OK" response)))))

;; Local Variables:
;; read-symbol-shorthands: (("t-" . "apo-tests-"))
;; End:
[Message part 3 (text/plain, inline)]

In GNU Emacs 30.0.50 (build 71, x86_64-pc-linux-gnu, GTK+ Version
 3.24.37, cairo version 1.16.0) of 2023-08-10 built on caladan
Repository revision: 164588b174774eba0c3bd6999633a39bed748195
Repository branch: master
Windowing system distributor 'The X.Org Foundation', version 11.0.12101007
System Description: Debian GNU/Linux 12 (bookworm)

Configured using:
 'configure --enable-checking=yes --with-xpm=ifavailable
 --with-gif=ifavailable 'CFLAGS=-g -O1''

Configured features:
CAIRO DBUS FREETYPE GLIB GMP GNUTLS GSETTINGS HARFBUZZ JPEG LIBSELINUX
LIBSYSTEMD LIBXML2 MODULES NOTIFY INOTIFY PDUMPER PNG SECCOMP SOUND
SQLITE3 THREADS TIFF TOOLKIT_SCROLL_BARS WEBP X11 XDBE XIM XINPUT2 GTK3
ZLIB

Important settings:
  value of $LANG: C.UTF-8
  locale-coding-system: utf-8-unix

Information forwarded to bug-gnu-emacs <at> gnu.org:
bug#65211; Package emacs. (Thu, 10 Aug 2023 15:22:02 GMT) Full text and rfc822 format available.

Message #8 received at 65211 <at> debbugs.gnu.org (full text, mbox):

From: Robert Pluim <rpluim <at> gmail.com>
To: Helmut Eller <eller.helmut <at> gmail.com>
Cc: 65211 <at> debbugs.gnu.org
Subject: Re: bug#65211: 30.0.50; threads and accept-process-output
Date: Thu, 10 Aug 2023 17:21:28 +0200
>>>>> On Thu, 10 Aug 2023 17:04:48 +0200, Helmut Eller <eller.helmut <at> gmail.com> said:

    Helmut> The test case from below hangs when executed with
    Helmut>   emacs -Q --batch -l apo-tests.el -f ert-run-tests-batch-and-exit

    Helmut> It's not 100% reproducible, but it seems to hang 80% of the time.

    Helmut> The test is fairly complicated and involves a TLS connection, a
    Helmut> sub-process and a background thread.  The background thread starts a
    Helmut> sub-process and reads all its output until the process terminates.  Then
    Helmut> the main thread opens a TLS connection, sends a HTTP request and tries
    Helmut> to read the response.  At this point, accept-process-output hangs even
    Helmut> though there is output available.

    Helmut> I think the reason for this problem is that the sub-process uses a file
    Helmut> descriptor (usually number 6) and sets
    Helmut> fd_callback_info[6].waiting_thread to the background thread (in
    Helmut> compute_non_keyboard_wait_mask) .  Later, the TLS connection receives a
    Helmut> file descriptor with the same number (6) because the sub-process closed
    Helmut> it already.  That would be fine, however
    Helmut> fd_callback_info[6].waiting_thread is still set to the background
    Helmut> thread.  So this time compute_non_keyboard_wait_mask doesn't include 6
    Helmut> in the wait mask.  Because 6 is not in the wait mask,
    Helmut> emacs_gnutls_record_check_pending will not be called and Emacs doesn't
    Helmut> notice the buffered output.

    Helmut> The intention in the code seems to be that clear_waiting_thread_info
    Helmut> resets fd_callback_info[6].waiting_thread to NULL, however by the time
    Helmut> that clear_waiting_thread_info is called, max_desc was reduced to 4,
    Helmut> because somebody closed file descriptors 5 and 6.  So
    Helmut> clear_waiting_thread_info doesn't touch fd_callback_info[6].

    Helmut> A possible solution would be to reset the .waiting_thread field in
    Helmut> delete_read_fd, like so:

    Helmut> diff --git a/src/process.c b/src/process.c
    Helmut> index 08cb810ec13..74d0bf252ab 100644
    Helmut> --- a/src/process.c
    Helmut> +++ b/src/process.c
    Helmut> @@ -513,6 +513,9 @@ delete_read_fd (int fd)
    Helmut>      {
    Helmut>        fd_callback_info[fd].func = 0;
    Helmut>        fd_callback_info[fd].data = 0;
    Helmut> +
    Helmut> +      if (fd_callback_info[fd].waiting_thread == current_thread)
    Helmut> +	fd_callback_info[fd].waiting_thread = NULL;
    Helmut>      }
    Helmut>  }

Hmm. Maybe putting it in `deactivate_process' is better (not that Iʼve
tested 😺)

Robert
-- 




Information forwarded to bug-gnu-emacs <at> gnu.org:
bug#65211; Package emacs. (Thu, 10 Aug 2023 16:26:01 GMT) Full text and rfc822 format available.

Message #11 received at 65211 <at> debbugs.gnu.org (full text, mbox):

From: Helmut Eller <eller.helmut <at> gmail.com>
To: Robert Pluim <rpluim <at> gmail.com>
Cc: 65211 <at> debbugs.gnu.org
Subject: Re: bug#65211: 30.0.50; threads and accept-process-output
Date: Thu, 10 Aug 2023 18:24:52 +0200
On Thu, Aug 10 2023, Robert Pluim wrote:

> Hmm. Maybe putting it in `deactivate_process' is better (not that Iʼve
> tested 😺)

That's for the maintainers to decide.  It seems to work too.  To me,
delete_read_fd feels like the appropriate place.

For symmetry, delete_write_fd should probably also do something with
.waiting_thread, but coming up with a test case for that is beyond my
understanding of this code.

Helmut




Information forwarded to bug-gnu-emacs <at> gnu.org:
bug#65211; Package emacs. (Fri, 11 Aug 2023 08:34:02 GMT) Full text and rfc822 format available.

Message #14 received at 65211 <at> debbugs.gnu.org (full text, mbox):

From: Helmut Eller <eller.helmut <at> gmail.com>
To: Robert Pluim <rpluim <at> gmail.com>
Cc: 65211 <at> debbugs.gnu.org
Subject: Re: bug#65211: 30.0.50; threads and accept-process-output
Date: Fri, 11 Aug 2023 10:32:53 +0200
[Message part 1 (text/plain, inline)]
On Thu, Aug 10 2023, Robert Pluim wrote:

> Hmm. Maybe putting it in `deactivate_process' is better (not that Iʼve
> tested 😺)

After a night of sleep, it occurred to me that this could also be
considered a problem of incomplete initialization of the
fd_callback_info[fd] struct.

So I wondered who is supposed to initialize it.  Unfortunately, there is
half a dozen of add_<foo>_fd functions that set various bits, some call
each other, some duplicate code.  None of them sets the waiting_thread
slot.  And there is no single point that clears everything out (except
init_process_emacs).

I also discovered that recompute_max_desc considers an fd unused if the
.flags field is zero.  Luckily, recompute_max_desc is only called in
three places: delete_write_fd, delete_keyboard_wait_descriptor and
deactivate_process.  The call in deactivate_process seems redundant as
it calls the other two anyway.  It seems easier to re-initialize the
struct when it becomes unused than to initialize it when it is used the
first time.

So I'm proposing the patch below that moves the re-initialization to one
single place, namely: recompute_max_desc.

Helmut

[0001-Re-initialize-fd_callback_data-fd-more-thoroughly.patch (text/x-diff, inline)]
From 675a76b297e9f050f719489a1419f0278ad8e9ae Mon Sep 17 00:00:00 2001
From: Helmut Eller <eller.helmut <at> gmail.com>
Date: Fri, 11 Aug 2023 10:31:24 +0200
Subject: [PATCH] Re-initialize fd_callback_data[fd] more thoroughly

The waiting_thread field was not re-initialized properly when a a
closed file descriptor resued by another process. (bug#65211)

As recompute_max_desc must be called when a file descriptor becomes
unused, we can do some re-initialization there.

* src/process.c (recompute_max_desc): Take the fd that was just
deleted as argument fd.  If the flags field is 0, reset the rest of
the struct too.
(delete_write_fd, delete_keyboard_wait_descriptor): Update callers
accordingly.
(delete_read_fd): This is now just an alias for
delete_keyboard_wait_descriptor.
(deactivate_process): Remove the call to recompute_max_desc, as
delete_read_fd is called anyway.
---
 src/process.c | 46 ++++++++++++++++------------------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/src/process.c b/src/process.c
index 08cb810ec13..cd5fd97e9d6 100644
--- a/src/process.c
+++ b/src/process.c
@@ -507,13 +507,6 @@ void
 delete_read_fd (int fd)
 {
   delete_keyboard_wait_descriptor (fd);
-
-  eassert (0 <= fd && fd < FD_SETSIZE);
-  if (fd_callback_info[fd].flags == 0)
-    {
-      fd_callback_info[fd].func = 0;
-      fd_callback_info[fd].data = 0;
-    }
 }
 
 /* Add a file descriptor FD to be monitored for when write is possible.
@@ -544,19 +537,22 @@ add_non_blocking_write_fd (int fd)
 }
 
 static void
-recompute_max_desc (void)
+recompute_max_desc (int fd)
 {
-  int fd;
-
+  eassert (0 <= fd && fd < FD_SETSIZE);
   eassert (max_desc < FD_SETSIZE);
-  for (fd = max_desc; fd >= 0; --fd)
-    {
+
+  if (fd_callback_info[fd].flags == 0)
+    fd_callback_info[fd] = (struct fd_callback_data){ 0 };
+
+  if (fd == max_desc)
+    for (; fd >= 0; --fd)
       if (fd_callback_info[fd].flags != 0)
-	{
-	  max_desc = fd;
-	  break;
-	}
-    }
+        {
+          max_desc = fd;
+          break;
+        }
+
   eassert (max_desc < FD_SETSIZE);
 }
 
@@ -569,17 +565,10 @@ delete_write_fd (int fd)
   if ((fd_callback_info[fd].flags & NON_BLOCKING_CONNECT_FD) != 0)
     {
       if (--num_pending_connects < 0)
-	emacs_abort ();
+        emacs_abort ();
     }
   fd_callback_info[fd].flags &= ~(FOR_WRITE | NON_BLOCKING_CONNECT_FD);
-  if (fd_callback_info[fd].flags == 0)
-    {
-      fd_callback_info[fd].func = 0;
-      fd_callback_info[fd].data = 0;
-
-      if (fd == max_desc)
-	recompute_max_desc ();
-    }
+  recompute_max_desc (fd);
 }
 
 static void
@@ -4809,8 +4798,6 @@ deactivate_process (Lisp_Object proc)
       delete_read_fd (inchannel);
       if ((fd_callback_info[inchannel].flags & NON_BLOCKING_CONNECT_FD) != 0)
 	delete_write_fd (inchannel);
-      if (inchannel == max_desc)
-	recompute_max_desc ();
     }
 }
 
@@ -8134,8 +8121,7 @@ delete_keyboard_wait_descriptor (int desc)
 
   fd_callback_info[desc].flags &= ~(FOR_READ | KEYBOARD_FD | PROCESS_FD);
 
-  if (desc == max_desc)
-    recompute_max_desc ();
+  recompute_max_desc (desc);
 #endif
 }
 
-- 
2.39.2


Added tag(s) patch. Request was from Stefan Kangas <stefankangas <at> gmail.com> to control <at> debbugs.gnu.org. (Fri, 08 Sep 2023 17:25:02 GMT) Full text and rfc822 format available.

This bug report was last modified 1 year and 343 days ago.

Previous Next


GNU bug tracking system
Copyright (C) 1999 Darren O. Benham, 1997,2003 nCipher Corporation Ltd, 1994-97 Ian Jackson.