GNU bug report logs - #13127
[PATCH] cut: use only one data strucutre

Previous Next

Package: coreutils;

Reported by: xojoc <at> gmx.com

Date: Sun, 9 Dec 2012 10:29:01 UTC

Severity: normal

Tags: patch

Done: Pádraig Brady <P <at> draigBrady.com>

Bug is archived. No further changes may be made.

Full log


View this message in rfc822 format

From: help-debbugs <at> gnu.org (GNU bug Tracking System)
To: Pádraig Brady <P <at> draigBrady.com>
Cc: tracker <at> debbugs.gnu.org
Subject: bug#13127: closed ([PATCH] cut: use only one data strucutre)
Date: Fri, 26 Apr 2013 23:47:02 +0000
[Message part 1 (text/plain, inline)]
Your message dated Sat, 27 Apr 2013 00:46:43 +0100
with message-id <517B11E3.8030904 <at> draigBrady.com>
and subject line Re: bug#13127: [PATCH] cut: use only one data strucutre
has caused the debbugs.gnu.org bug report #13127,
regarding [PATCH] cut: use only one data strucutre
to be marked as done.

(If you believe you have received this mail in error, please contact
help-debbugs <at> gnu.org.)


-- 
13127: http://debbugs.gnu.org/cgi/bugreport.cgi?bug=13127
GNU Bug Tracking System
Contact help-debbugs <at> gnu.org with problems
[Message part 2 (message/rfc822, inline)]
From: Cojocaru Alexandru <xojoc <at> gmx.com>
To: bug-coreutils <at> gnu.org
Subject: [PATCH] cut: use only one data strucutre
Date: Sun, 9 Dec 2012 11:28:05 +0100
[Message part 3 (text/plain, inline)]
From 678c2ecfebbf7a278c14b7e6fcb815e87569cd20 Mon Sep 17 00:00:00 2001
From: Cojocaru Alexandru <xojoc <at> gmx.com>
Date: Sun, 9 Dec 2012 10:43:10 +0100
Subject: [PATCH] cut: use only one data structure

The current implementation of cut, uses a bit array,
an array of `struct range_pair's, and (when --output-delimiter
is specified) a hash_table. The new implementation will use
only an array of `struct range_pair's.
The old implementation is inefficient for the following reasons:
 1. When -b with a big num is specified, it allocates a lot of useless
    memory for `printable_field'.
 2. When --output-delimiter is specified, it will allocate 31 buckets.
    Even if a few ranges are specified.

* src/cut.c (set_fields): Set and initialize RP
instead of printable_field.
* src/cut.c (print_kth): Split it. Check *only* if a given field
or byte is printable.
* src/cut.c (is_range_start): New function.
* tests/misc/cut.pl: Check if `eol_range_start' is set correctly.
---
 src/cut.c         | 312 +++++++++++++++++++-----------------------------------
 tests/misc/cut.pl |   4 +
 2 files changed, 113 insertions(+), 203 deletions(-)

diff --git a/src/cut.c b/src/cut.c
index de9320c..545639d 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -53,8 +53,31 @@
     }									\
   while (0)
 
+
+struct range_pair
+  {
+    size_t lo;
+    size_t hi;
+  };
+
+/* Array of `struct range_pair' holding all the finite ranges. */
+static struct range_pair *rp;
+
+/* Pointer inside RP. When checking if a byte or field is selected
+   by a finite range, we check if it is between CURRENT_RP.LO
+   and CURRENT_RP.HI. If the byte or field index is greater than
+   CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
+static struct range_pair *current_rp;
+
+/* Number of finite ranges specified by the user. */
+static size_t n_rp;
+
+/* Number of `struct range_pair's allocated. */
+static size_t n_rp_allocated;
+
+
 /* Append LOW, HIGH to the list RP of range pairs, allocating additional
-   space if necessary.  Update local variable N_RP.  When allocating,
+   space if necessary.  Update global variable N_RP.  When allocating,
    update global variable N_RP_ALLOCATED.  */
 
 #define ADD_RANGE_PAIR(rp, low, high)			\
@@ -72,11 +95,6 @@
     }							\
   while (0)
 
-struct range_pair
-  {
-    size_t lo;
-    size_t hi;
-  };
 
 /* This buffer is used to support the semantics of the -s option
    (or lack of same) when the specified field list includes (does
@@ -90,26 +108,11 @@ static char *field_1_buffer;
 /* The number of bytes allocated for FIELD_1_BUFFER.  */
 static size_t field_1_bufsize;
 
-/* The largest field or byte index used as an endpoint of a closed
-   or degenerate range specification;  this doesn't include the starting
-   index of right-open-ended ranges.  For example, with either range spec
-   '2-5,9-', '2-3,5,9-' this variable would be set to 5.  */
-static size_t max_range_endpoint;
 
 /* If nonzero, this is the index of the first field in a range that goes
    to end of line. */
 static size_t eol_range_start;
 
-/* This is a bit vector.
-   In byte mode, which bytes to output.
-   In field mode, which DELIM-separated fields to output.
-   Both bytes and fields are numbered starting with 1,
-   so the zeroth bit of this array is unused.
-   A field or byte K has been selected if
-   (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
-    || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
-static unsigned char *printable_field;
-
 enum operating_mode
   {
     undefined_mode,
@@ -148,15 +151,6 @@ static char *output_delimiter_string;
 /* True if we have ever read standard input. */
 static bool have_read_stdin;
 
-#define HT_RANGE_START_INDEX_INITIAL_CAPACITY 31
-
-/* The set of range-start indices.  For example, given a range-spec list like
-   '-b1,3-5,4-9,15-', the following indices will be recorded here: 1, 3, 15.
-   Note that although '4' looks like a range-start index, it is in the middle
-   of the '3-5' range, so it doesn't count.
-   This table is created/used IFF output_delimiter_specified is set.  */
-static Hash_table *range_start_ht;
-
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
@@ -240,73 +234,33 @@ With no FILE, or when FILE is -, read standard input.\n\
   exit (status);
 }
 
-static inline void
-mark_range_start (size_t i)
-{
-  /* Record the fact that 'i' is a range-start index.  */
-  void *ent_from_table = hash_insert (range_start_ht, (void*) i);
-  if (ent_from_table == NULL)
-    {
-      /* Insertion failed due to lack of memory.  */
-      xalloc_die ();
-    }
-  assert ((size_t) ent_from_table == i);
-}
-
-static inline void
-mark_printable_field (size_t i)
-{
-  size_t n = i / CHAR_BIT;
-  printable_field[n] |= (1 << (i % CHAR_BIT));
-}
-
-static inline bool
-is_printable_field (size_t i)
-{
-  size_t n = i / CHAR_BIT;
-  return (printable_field[n] >> (i % CHAR_BIT)) & 1;
-}
-
-static size_t
-hash_int (const void *x, size_t tablesize)
-{
-#ifdef UINTPTR_MAX
-  uintptr_t y = (uintptr_t) x;
-#else
-  size_t y = (size_t) x;
-#endif
-  return y % tablesize;
-}
+/* Return nonzero if the K'th field or byte is printable. */
 
 static bool
-hash_compare_ints (void const *x, void const *y)
+print_kth (size_t k)
 {
-  return (x == y) ? true : false;
-}
+  bool k_selected = false;
+  if (0 < eol_range_start && eol_range_start <= k)
+    k_selected = true;
+  else if (current_rp->lo <= k && k <= current_rp->hi)
+    k_selected = true;
 
-static bool
-is_range_start_index (size_t i)
-{
-  return hash_lookup (range_start_ht, (void *) i) ? true : false;
+  return k_selected ^ complement;
 }
 
-/* Return nonzero if the K'th field or byte is printable.
-   When returning nonzero, if RANGE_START is non-NULL,
-   set *RANGE_START to true if K is the beginning of a range, and to
-   false otherwise.  */
+/* Return nonzero if K'th byte is the beginning of a range. */
 
-static bool
-print_kth (size_t k, bool *range_start)
+static inline bool
+is_range_start (size_t k)
 {
-  bool k_selected
-    = ((0 < eol_range_start && eol_range_start <= k)
-       || (k <= max_range_endpoint && is_printable_field (k)));
+  bool is_start = false;
 
-  bool is_selected = k_selected ^ complement;
-  if (range_start && is_selected)
-    *range_start = is_range_start_index (k);
+  if (!complement)
+    is_start = (k == eol_range_start || k == current_rp->lo);
+  else
+    is_start = (k == (current_rp - 1)->hi + 1);
 
-  return is_selected;
+  return is_start;
 }
 
 /* Comparison function for qsort to order the list of
@@ -319,24 +273,14 @@ compare_ranges (const void *a, const void *b)
   return a_start < b_start ? -1 : a_start > b_start;
 }
 
-/* Given the list of field or byte range specifications FIELDSTR, set
-   MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
-   array.  If there is a right-open-ended range, set EOL_RANGE_START
-   to its starting index.  FIELDSTR should be composed of one or more
-   numbers or ranges of numbers, separated by blanks or commas.
-   Incomplete ranges may be given: '-m' means '1-m'; 'n-' means 'n'
-   through end of line.  Return true if FIELDSTR contains at least
-   one field specification, false otherwise.  */
-
-/* FIXME-someday:  What if the user wants to cut out the 1,000,000-th
-   field of some huge input file?  This function shouldn't have to
-   allocate a table of a million bits just so we can test every
-   field < 10^6 with an array dereference.  Instead, consider using
-   an adaptive approach: if the range of selected fields is too large,
-   but only a few fields/byte-offsets are actually selected, use a
-   hash table.  If the range of selected fields is too large, and
-   too many are selected, then resort to using the range-pairs (the
-   'rp' array) directly.  */
+/* Given the list of field or byte range specifications FIELDSTR,
+   allocate and initialize the RP array. If there is a right-open-ended
+   range, set EOL_RANGE_START to its starting index. FIELDSTR should
+   be composed of one or more numbers or ranges of numbers, separated
+   by blanks or commas. Incomplete ranges may be given: '-m' means '1-m';
+   'n-' means 'n' through end of line.
+   Return true if FIELDSTR contains at least one field specification,
+   false otherwise.  */
 
 static bool
 set_fields (const char *fieldstr)
@@ -349,9 +293,6 @@ set_fields (const char *fieldstr)
   bool field_found = false;	/* True if at least one field spec
                                    has been processed.  */
 
-  struct range_pair *rp = NULL;
-  size_t n_rp = 0;
-  size_t n_rp_allocated = 0;
   size_t i;
   bool in_digits = false;
 
@@ -403,41 +344,10 @@ set_fields (const char *fieldstr)
                   if (value < initial)
                     FATAL_ERROR (_("invalid decreasing range"));
 
-                  /* Is there already a range going to end of line? */
-                  if (eol_range_start != 0)
-                    {
-                      /* Yes.  Is the new sequence already contained
-                         in the old one?  If so, no processing is
-                         necessary. */
-                      if (initial < eol_range_start)
-                        {
-                          /* No, the new sequence starts before the
-                             old.  Does the old range going to end of line
-                             extend into the new range?  */
-                          if (eol_range_start <= value)
-                            {
-                              /* Yes.  Simply move the end of line marker. */
-                              eol_range_start = initial;
-                            }
-                          else
-                            {
-                              /* No.  A simple range, before and disjoint from
-                                 the range going to end of line.  Fill it. */
-                              ADD_RANGE_PAIR (rp, initial, value);
-                            }
-
-                          /* In any case, some fields were selected. */
-                          field_found = true;
-                        }
-                    }
-                  else
-                    {
-                      /* There is no range going to end of line. */
-                      ADD_RANGE_PAIR (rp, initial, value);
-                      field_found = true;
-                    }
-                  value = 0;
+                  ADD_RANGE_PAIR (rp, initial, value);
+                  field_found = true;
                 }
+              value = 0;
             }
           else
             {
@@ -448,9 +358,7 @@ set_fields (const char *fieldstr)
             }
 
           if (*fieldstr == '\0')
-            {
-              break;
-            }
+            break;
 
           fieldstr++;
           lhs_specified = false;
@@ -494,47 +402,43 @@ set_fields (const char *fieldstr)
         FATAL_ERROR (_("invalid byte, character or field list"));
     }
 
-  max_range_endpoint = 0;
-  for (i = 0; i < n_rp; i++)
+  qsort (rp, n_rp, sizeof (rp[0]), compare_ranges);
+
+  /* Omit finite ranges subsumed by a to-EOL range. */
+  if (eol_range_start && n_rp)
     {
-      if (rp[i].hi > max_range_endpoint)
-        max_range_endpoint = rp[i].hi;
+      i = n_rp;
+      while (i && eol_range_start <= rp[i - 1].hi)
+        {
+          eol_range_start = MIN (rp[i - 1].lo, eol_range_start);
+          --n_rp;
+          --i;
+        }
     }
 
-  /* Allocate an array large enough so that it may be indexed by
-     the field numbers corresponding to all finite ranges
-     (i.e. '2-6' or '-4', but not '5-') in FIELDSTR.  */
-
-  if (max_range_endpoint)
-    printable_field = xzalloc (max_range_endpoint / CHAR_BIT + 1);
-
-  qsort (rp, n_rp, sizeof (rp[0]), compare_ranges);
-
-  /* Set the array entries corresponding to integers in the ranges of RP.  */
-  for (i = 0; i < n_rp; i++)
+  /* Merge finite range pairs (e.g. `2-5,3-4' becomes `2-5'). */
+  for (i = 0; i < n_rp; ++i)
     {
-      /* Ignore any range that is subsumed by the to-EOL range.  */
-      if (eol_range_start && eol_range_start <= rp[i].lo)
-        continue;
-
-      /* Record the range-start indices, i.e., record each start
-         index that is not part of any other (lo..hi] range.  */
-      size_t rsi_candidate = complement ? rp[i].hi + 1 : rp[i].lo;
-      if (output_delimiter_specified
-          && !is_printable_field (rsi_candidate))
-        mark_range_start (rsi_candidate);
-
-      for (size_t j = rp[i].lo; j <= rp[i].hi; j++)
-        mark_printable_field (j);
+      for (size_t j = i + 1; j < n_rp; ++j)
+        {
+          if (rp[j].lo <= rp[i].hi)
+            {
+              rp[i].hi = MAX (rp[j].hi, rp[i].hi);
+              memmove (rp + j, rp + j + 1,
+                       (n_rp - j - 1) * sizeof (struct range_pair));
+              --n_rp;
+            }
+          else
+            break;
+        }
     }
 
-  if (output_delimiter_specified
-      && !complement
-      && eol_range_start
-      && max_range_endpoint && !is_printable_field (eol_range_start))
-    mark_range_start (eol_range_start);
 
-  free (rp);
+  /* After merging, reallocate RP so we realise memory to the system.
+     Also add a sentinel at the end of RP, so we never get memory segfault. */
+  ++n_rp;
+  rp = xrealloc (rp, n_rp * sizeof (struct range_pair));
+  rp[n_rp - 1].lo = rp[n_rp - 1].hi = 0;
 
   return field_found;
 }
@@ -551,7 +455,8 @@ cut_bytes (FILE *stream)
 
   byte_idx = 0;
   print_delimiter = false;
-  while (1)
+  current_rp = rp;
+  while (true)
     {
       int c;		/* Each character from the file. */
 
@@ -562,6 +467,7 @@ cut_bytes (FILE *stream)
           putchar ('\n');
           byte_idx = 0;
           print_delimiter = false;
+          current_rp = rp;
         }
       else if (c == EOF)
         {
@@ -571,16 +477,21 @@ cut_bytes (FILE *stream)
         }
       else
         {
-          bool range_start;
-          bool *rs = output_delimiter_specified ? &range_start : NULL;
-          if (print_kth (++byte_idx, rs))
+          ++byte_idx;
+          if ((current_rp->hi < byte_idx) && (current_rp < rp + n_rp - 1))
+            ++current_rp;
+          if (print_kth (byte_idx))
             {
-              if (rs && *rs && print_delimiter)
+              if (output_delimiter_specified)
                 {
-                  fwrite (output_delimiter_string, sizeof (char),
-                          output_delimiter_length, stdout);
+                  if (print_delimiter && is_range_start (byte_idx))
+                    {
+                      fwrite (output_delimiter_string, sizeof (char),
+                              output_delimiter_length, stdout);
+                    }
+                  print_delimiter = true;
                 }
-              print_delimiter = true;
+
               putchar (c);
             }
         }
@@ -597,6 +508,8 @@ cut_fields (FILE *stream)
   bool found_any_selected_field = false;
   bool buffer_first_field;
 
+  current_rp = rp;
+
   c = getc (stream);
   if (c == EOF)
     return;
@@ -609,7 +522,7 @@ cut_fields (FILE *stream)
      and the first field has been selected, or if non-delimited lines
      must be suppressed and the first field has *not* been selected.
      That is because a non-delimited line has exactly one field.  */
-  buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL));
+  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
 
   while (1)
     {
@@ -650,7 +563,7 @@ cut_fields (FILE *stream)
                 }
               continue;
             }
-          if (print_kth (1, NULL))
+          if (print_kth (1))
             {
               /* Print the field, but not the trailing delimiter.  */
               fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout);
@@ -661,7 +574,7 @@ cut_fields (FILE *stream)
 
       if (c != EOF)
         {
-          if (print_kth (field_idx, NULL))
+          if (print_kth (field_idx))
             {
               if (found_any_selected_field)
                 {
@@ -695,7 +608,11 @@ cut_fields (FILE *stream)
         }
 
       if (c == delim)
-        ++field_idx;
+        {
+          ++field_idx;
+          if ((field_idx > current_rp->hi) && (current_rp < rp + n_rp - 1))
+            ++current_rp;
+        }
       else if (c == '\n' || c == EOF)
         {
           if (found_any_selected_field
@@ -704,6 +621,7 @@ cut_fields (FILE *stream)
           if (c == EOF)
             break;
           field_idx = 1;
+          current_rp = rp;
           found_any_selected_field = false;
         }
     }
@@ -854,16 +772,6 @@ main (int argc, char **argv)
     FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
 \tonly when operating on fields"));
 
-  if (output_delimiter_specified)
-    {
-      range_start_ht = hash_initialize (HT_RANGE_START_INDEX_INITIAL_CAPACITY,
-                                        NULL, hash_int,
-                                        hash_compare_ints, NULL);
-      if (range_start_ht == NULL)
-        xalloc_die ();
-
-    }
-
   if (! set_fields (spec_list_string))
     {
       if (operating_mode == field_mode)
@@ -890,8 +798,6 @@ main (int argc, char **argv)
     for (ok = true; optind < argc; optind++)
       ok &= cut_file (argv[optind]);
 
-  if (range_start_ht)
-    hash_free (range_start_ht);
 
   if (have_read_stdin && fclose (stdin) == EOF)
     {
diff --git a/tests/misc/cut.pl b/tests/misc/cut.pl
index 0f0a3a3..120880c 100755
--- a/tests/misc/cut.pl
+++ b/tests/misc/cut.pl
@@ -182,6 +182,10 @@ my @Tests =
                                          {IN=>"123456\n"}, {OUT=>"23456\n"}],
   ['EOL-subsumed-3', '--complement -b3,4-4,5,2-',
                                          {IN=>"123456\n"}, {OUT=>"1\n"}],
+
+  ['EOL-subsumed-4', '--output-d=: -b1-2,2-3,3-',
+                                        {IN=>"1234\n"}, {OUT=>"1234\n"}],
+
  );
 
 if ($mb_locale ne 'C')
-- 
1.8.0.1


Best regards,
Cojocaru Alexandru
[DIFF-new-impl (application/octet-stream, attachment)]
[Message part 5 (message/rfc822, inline)]
From: Pádraig Brady <P <at> draigBrady.com>
To: Alexandru Cojocaru <xojoc <at> gmx.com>
Cc: 13127-done <at> debbugs.gnu.org
Subject: Re: bug#13127: [PATCH] cut: use only one data strucutre
Date: Sat, 27 Apr 2013 00:46:43 +0100
On 04/26/2013 11:49 PM, Alexandru Cojocaru wrote:

>>>> The cut.pl test is now failing on master. Could you have a look.
>>> I had no problems. Could you show me your output?
>>
>> Ah the failures are in tests I added in the meantime:
>> http://git.sv.gnu.org/gitweb/?p=coreutils.git;a=commit;h=51ce0bf8
>>
>> Specifically this is now only outputting the first field,
>> rather than both fields like it should:
>>
>> printf '%s\n' a:1 b:2 | src/cut -s -d: -f1,2
> The problem was caused by `current_rp' which wasn't
> incremented as needed. See attachment for patch.
> My tests were succesfull, can you recheck?

Great tests now pass here.
I'll give it a thorough review tomorrow and apply.

thanks,
Pádraig.



This bug report was last modified 12 years and 77 days ago.

Previous Next


GNU bug tracking system
Copyright (C) 1999 Darren O. Benham, 1997,2003 nCipher Corporation Ltd, 1994-97 Ian Jackson.