GNU bug report logs - #7489
[coreutils] over aggressive threads in sort

Previous Next

Package: coreutils;

Reported by: DJ Lucas <dj <at> linuxfromscratch.org>

Date: Fri, 26 Nov 2010 19:40:02 UTC

Severity: normal

Tags: fixed

Done: Assaf Gordon <assafgordon <at> gmail.com>

Bug is archived. No further changes may be made.

Full log


View this message in rfc822 format

From: Jim Meyering <jim <at> meyering.net>
To: Chen Guo <chen.guo.0625 <at> gmail.com>
Cc: Paul Eggert <eggert <at> cs.ucla.edu>, DJ Lucas <dj <at> linuxfromscratch.org>, 7489 <at> debbugs.gnu.org, coreutils <at> gnu.org
Subject: bug#7489: [coreutils] over aggressive threads in sort
Date: Tue, 07 Dec 2010 11:06:53 +0100
Chen Guo wrote:
...
>     I've attached the patch (inlined at the bottom). Here's the GDB
> crash, with backtrace. I also printed node->queued in GDB, so it's
> evident that its accessible.
>
> (gdb) run --parallel 2 rec_1M > /dev/null
> Starting program: /data/chen/Coding/Coreutils/test/sort-mutex
> --parallel 2 rec_1M > /dev/null
> [Thread debugging using libthread_db enabled]
> [New Thread 0x7fffcbb95710 (LWP 19850)]
>
> Program received signal SIGSEGV, Segmentation fault.
> queue_insert (queue=0x7fffffffe0c0, node=0x7ffff7ddc560) at sort.c:3202
> 3202	  node->queued = true;
> (gdb) bt
...
> (gdb) print node
> $1 = (struct merge_node *) 0x7ffff7ddc560
> (gdb) print node->queued
> $2 = false

Could it be that you're looking at one thread,
while the segfault happened in another?
In my core dump, the offending "node" pointer had a value of 5.

...
> And here's the patch:
>
> Subject: [PATCH] sort: change spinlocks to mutexes.
>
> * src/sort.c: (struct merge_node) Change member lock to mutex. All
> uses changed.

Hi Chen,

Thanks for the patch.
Since this fixes a bug, I've added a NEWS entry.
Also, since with your patch, the sort-spinlock-abuse
test now passes, I've adjusted tests/Makefile.am to reflect that.
The segfault may be easier to reproduce with mutexes,
but I've seen the same one also with spinlocks (albeit rarely).

Here's the amended patch:

From 7a80bc63e1411f0a2ed7c4259e852de34591a65a Mon Sep 17 00:00:00 2001
From: Chen Guo <chenguo4 <at> ucla.edu>
Date: Mon, 6 Dec 2010 00:15:42 -0800
Subject: [PATCH] sort: use mutexes, not spinlocks (avoid busy loop on blocked output)

Running a command like this on a multi-core system
  sort < big-file | less
would peg all processors at near 100% utilization.
* src/sort.c: (struct merge_node) Change member lock to mutex.
All uses changed.
* tests/Makefile.am (XFAIL_TESTS): Remove definition, now that
this test passes once again.  I.e., the sort-spinlock-abuse test
no longer fails.
* NEWS (Bug reports): Mention this.
Reported by DJ Lucas in http://debbugs.gnu.org/7489.
---
 NEWS              |    5 +++++
 src/sort.c        |   14 +++++++-------
 tests/Makefile.am |    3 ---
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index c3110a3..9f55cbb 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,11 @@ GNU coreutils NEWS                                    -*- outline -*-
   sort -u with at least two threads could attempt to read through a
   corrupted pointer. [bug introduced in coreutils-8.6]

+  sort with at least two threads and with blocked output would busy-loop
+  (spinlock) all threads, often using 100% of available CPU cycles to
+  do no work.  I.e., "sort < big-file | less" could waste a lot of power.
+  [bug introduced in coreutils-8.6]
+
 ** New features

   split accepts the --number option to generate a specific number of files.
diff --git a/src/sort.c b/src/sort.c
index a4c2cbf..9cfc814 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -241,7 +241,7 @@ struct merge_node
   struct merge_node *parent;    /* Parent node. */
   unsigned int level;           /* Level in merge tree. */
   bool queued;                  /* Node is already in heap. */
-  pthread_spinlock_t lock;      /* Lock for node operations. */
+  pthread_mutex_t lock;         /* Lock for node operations. */
 };

 /* Priority queue of merge nodes. */
@@ -3157,7 +3157,7 @@ compare_nodes (void const *a, void const *b)
 static inline void
 lock_node (struct merge_node *node)
 {
-  pthread_spin_lock (&node->lock);
+  pthread_mutex_lock (&node->lock);
 }

 /* Unlock a merge tree NODE. */
@@ -3165,7 +3165,7 @@ lock_node (struct merge_node *node)
 static inline void
 unlock_node (struct merge_node *node)
 {
-  pthread_spin_unlock (&node->lock);
+  pthread_mutex_unlock (&node->lock);
 }

 /* Destroy merge QUEUE. */
@@ -3479,7 +3479,7 @@ sortlines (struct line *restrict lines, struct line *restrict dest,
   node.parent = parent;
   node.level = parent->level + 1;
   node.queued = false;
-  pthread_spin_init (&node.lock, PTHREAD_PROCESS_PRIVATE);
+  pthread_mutex_init (&node.lock, NULL);

   /* Calculate thread arguments. */
   unsigned long int lo_threads = nthreads / 2;
@@ -3515,7 +3515,7 @@ sortlines (struct line *restrict lines, struct line *restrict dest,
       merge_loop (queue, total_lines, tfp, temp_output);
     }

-  pthread_spin_destroy (&node.lock);
+  pthread_mutex_destroy (&node.lock);
 }

 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
@@ -3799,12 +3799,12 @@ sort (char * const *files, size_t nfiles, char const *output_file,
               node.parent = NULL;
               node.level = MERGE_END;
               node.queued = false;
-              pthread_spin_init (&node.lock, PTHREAD_PROCESS_PRIVATE);
+              pthread_mutex_init (&node.lock, NULL);

               sortlines (line, line, nthreads, buf.nlines, &node, true,
                          &queue, tfp, temp_output);
               queue_destroy (&queue);
-              pthread_spin_destroy (&node.lock);
+              pthread_mutex_destroy (&node.lock);
             }
           else
             write_unique (line - 1, tfp, temp_output);
diff --git a/tests/Makefile.am b/tests/Makefile.am
index d52f677..b573061 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -656,7 +656,4 @@ pr_data =					\
   pr/ttb3-FF					\
   pr/w72l24f-ll

-XFAIL_TESTS =					\
-  misc/sort-spinlock-abuse
-
 include $(srcdir)/check.mk
--
1.7.3.2.92.g7e4eb




This bug report was last modified 6 years and 202 days ago.

Previous Next


GNU bug tracking system
Copyright (C) 1999 Darren O. Benham, 1997,2003 nCipher Corporation Ltd, 1994-97 Ian Jackson.