GNU bug report logs - #7960
multibyte: fmt: fix formatting multibyte text (bug #7372)

Previous Next

Package: coreutils;

Reported by: Kostya Stopani <hatta <at> depni.sinp.msu.ru>

Date: Wed, 2 Feb 2011 14:42:01 UTC

Severity: normal

Tags: moreinfo, patch

Done: Assaf Gordon <assafgordon <at> gmail.com>

Bug is archived. No further changes may be made.

To add a comment to this bug, you must first unarchive it, by sending
a message to control AT debbugs.gnu.org, with unarchive 7960 in the body.
You can then email your comments to 7960 AT debbugs.gnu.org in the normal way.

Toggle the display of automated, internal messages from the tracker.

View this report as an mbox folder, status mbox, maintainer mbox


Report forwarded to owner <at> debbugs.gnu.org, bug-coreutils <at> gnu.org:
bug#7960; Package coreutils. (Wed, 02 Feb 2011 14:42:01 GMT) Full text and rfc822 format available.

Acknowledgement sent to Kostya Stopani <hatta <at> depni.sinp.msu.ru>:
New bug report received and forwarded. Copy sent to bug-coreutils <at> gnu.org. (Wed, 02 Feb 2011 14:42:01 GMT) Full text and rfc822 format available.

Message #5 received at submit <at> debbugs.gnu.org (full text, mbox):

From: Kostya Stopani <hatta <at> depni.sinp.msu.ru>
To: bug-coreutils <at> gnu.org
Subject: [PATCH] fmt: fix formatting multibyte text (bug #7372)
Date: Wed, 2 Feb 2011 17:17:12 +0300
From b118695b7b614f5f0e371cad885a01306f527d9e Mon Sep 17 00:00:00 2001
From: Kostya Stopani <hatta <at> depni.sinp.msu.ru>
Date: Wed, 2 Feb 2011 17:10:05 +0300
Subject: [PATCH] fmt: fix formatting multibyte text (bug #7372)

* src/fmt.c (guess_screen_width): Add function to compute screen width
of a possibly multibyte word to correctly format international
text. If it's not multibyte fall back to byte length.

* src/fmt.c (mbsnrtowcs): Stub function partly implementing a GNU
extension function of the same name for non-GNU platforms.

* src/fmt.c (struct Word): Add a new field "nchar" to hold byte-length
of "text".

* src/fmt.c (get_line, check_punctuation, put_word): Use Word.length
as screen width of a word and Word.nchar as byte-length.
---
 src/fmt.c |   91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/src/fmt.c b/src/fmt.c
index 7d5aee3..1dcbaaf 100644
--- a/src/fmt.c
+++ b/src/fmt.c
@@ -20,6 +20,10 @@
 #include <stdio.h>
 #include <sys/types.h>
 #include <getopt.h>
+#include <wchar.h>
+#include <string.h>
+#include <errno.h>
+
 
 /* Redefine.  Otherwise, systems (Unicos for one) with headers that define
    it to be a type get syntax errors for the variable declaration below.  */
@@ -135,6 +139,7 @@ struct Word
 
     const char *text;		/* the text of the word */
     int length;			/* length of this word */
+    int nchar;			/* number of char entries in text array */
     int space;			/* the size of the following space */
     unsigned int paren:1;	/* starts with open paren */
     unsigned int period:1;	/* ends in [.?!])* */
@@ -167,6 +172,11 @@ static void put_paragraph (WORD *finish);
 static void put_line (WORD *w, int indent);
 static void put_word (WORD *w);
 static void put_space (int space);
+static unsigned int guess_screen_width (const char *text, size_t b);
+#ifndef __GNU_LIBRARY__
+static size_t mbsnrtowcs (wchar_t *DST, const char **SRC, size_t NMC, size_t LEN,
+			  mbstate_t *restrict PS);
+#endif	/* __GNU_LIBRARY__ */
 
 /* Option values.  */
 
@@ -670,7 +680,10 @@ get_line (FILE *f, int c)
           c = getc (f);
         }
       while (c != EOF && !isspace (c));
-      in_column += word_limit->length = wptr - word_limit->text;
+      word_limit->nchar = wptr - word_limit->text;
+      word_limit->length = guess_screen_width (word_limit->text, word_limit->nchar); 
+      in_column += word_limit->length;
+      
       check_punctuation (word_limit);
 
       /* Scan inter-word space.  */
@@ -751,7 +764,7 @@ static void
 check_punctuation (WORD *w)
 {
   char const *start = w->text;
-  char const *finish = start + (w->length - 1);
+  char const *finish = start + (w->nchar - 1);
   unsigned char fin = *finish;
 
   w->paren = isopen (*start);
@@ -982,7 +995,7 @@ put_word (WORD *w)
   int n;
 
   s = w->text;
-  for (n = w->length; n != 0; n--)
+  for (n = w->nchar; n != 0; n--)
     putchar (*s++);
   out_column += w->length;
 }
@@ -1011,3 +1024,75 @@ put_space (int space)
       out_column++;
     }
 }
+
+/* Try to convert text from multibyte to wide characters and in this way
+   determine its screen width. Return number of bytes if conversion fails. */
+
+static unsigned int
+guess_screen_width (const char *text, size_t b)
+{
+  size_t c;
+  mbstate_t state;
+
+  memset (&state, 0, sizeof (state));
+
+  /* Try conversion */
+  c = mbsnrtowcs (NULL, &text, b, b, &state);
+  if (c > 0 && errno != EILSEQ)
+    return c;
+  else
+    return b;
+}
+
+
+#ifndef __GNU_LIBRARY__
+
+#define INITBUFSIZE 1024
+#define MAXBUFSIZE 1024*1024
+
+/* Stub mbsnrtowcs to be used when GNU extensions are unavailable. */
+
+size_t mbsnrtowcs (wchar_t *DST, const char **SRC, size_t NMC, size_t LEN,
+		      mbstate_t *restrict PS)
+{
+  static char *buf = NULL;
+  static size_t buf_size = INITBUFSIZE;
+  char *new_buf;
+  size_t new_buf_size, c;
+  mbstate_t state;
+
+  if (!buf)
+    {
+      buf = malloc (buf_size * sizeof (char));
+      if (!buf) return NMC;
+    }
+
+  memset (&state, 0, sizeof (state));
+
+  if (buf_size < NMC + 1)
+    {
+      /* Try to resize the buffer. */
+      new_buf_size = NMC + 1;
+      if (new_buf_size <= MAXBUFSIZE)
+	{
+	  new_buf = realloc (buf, new_buf_size * sizeof (char));
+	  if (!new_buf) return NMC;
+	  buf = new_buf;
+	  buf_size = new_buf_size;
+	}
+      else
+	{
+	  return NMC;
+	}
+    }
+
+  strncpy (buf, *SRC, NMC);
+  buf[NMC] = '\0';
+  new_buf = buf;
+  c = mbsrtowcs (NULL, (const char **restrict) &new_buf, NMC, &state);
+  if (c > 0 && errno != EILSEQ)
+    return c;
+  else
+    return NMC;
+}
+#endif	/* __GNU_LIBRARY__ */
-- 
1.7.0.4





Information forwarded to owner <at> debbugs.gnu.org, bug-coreutils <at> gnu.org:
bug#7960; Package coreutils. (Wed, 02 Feb 2011 17:08:01 GMT) Full text and rfc822 format available.

Message #8 received at 7960 <at> debbugs.gnu.org (full text, mbox):

From: Eric Blake <eblake <at> redhat.com>
To: Kostya Stopani <hatta <at> depni.sinp.msu.ru>
Cc: 7960 <at> debbugs.gnu.org
Subject: Re: bug#7960: [PATCH] fmt: fix formatting multibyte text (bug #7372)
Date: Wed, 02 Feb 2011 10:15:53 -0700
[Message part 1 (text/plain, inline)]
On 02/02/2011 07:17 AM, Kostya Stopani wrote:
>>From b118695b7b614f5f0e371cad885a01306f527d9e Mon Sep 17 00:00:00 2001
> From: Kostya Stopani <hatta <at> depni.sinp.msu.ru>
> Date: Wed, 2 Feb 2011 17:10:05 +0300
> Subject: [PATCH] fmt: fix formatting multibyte text (bug #7372)

Thanks for the patch.  However, it's not trivial, so it would need
copyright assignment.  Furthermore, there are already known issues where
upstream coreutils is lacking multibyte character support, but a
solution has to be both maintainable and no-impact to the single-byte
locale case.

> @@ -167,6 +172,11 @@ static void put_paragraph (WORD *finish);
>  static void put_line (WORD *w, int indent);
>  static void put_word (WORD *w);
>  static void put_space (int space);
> +static unsigned int guess_screen_width (const char *text, size_t b);
> +#ifndef __GNU_LIBRARY__
> +static size_t mbsnrtowcs (wchar_t *DST, const char **SRC, size_t NMC, size_t LEN,

Huh?  There's no need for either __GNU_LIBRARY__ or this
declaration; we can rely on gnulib to provide mbsnrtowcs on all
platforms.

> +/* Stub mbsnrtowcs to be used when GNU extensions are unavailable. */
> +
> +size_t mbsnrtowcs (wchar_t *DST, const char **SRC, size_t NMC, size_t LEN,

Therefore, this function has no place in fmt.c.

-- 
Eric Blake   eblake <at> redhat.com    +1-801-349-2682
Libvirt virtualization library http://libvirt.org

[signature.asc (application/pgp-signature, attachment)]

Information forwarded to owner <at> debbugs.gnu.org, bug-coreutils <at> gnu.org:
bug#7960; Package coreutils. (Wed, 02 Feb 2011 21:26:02 GMT) Full text and rfc822 format available.

Message #11 received at 7960 <at> debbugs.gnu.org (full text, mbox):

From: Eric Blake <eblake <at> redhat.com>
To: Kostya Stopani <hatta <at> depni.sinp.msu.ru>, 7960 <at> debbugs.gnu.org
Subject: Re: bug#7960: [PATCH] fmt: fix formatting multibyte text (bug #7372)
Date: Wed, 02 Feb 2011 14:33:44 -0700
[Message part 1 (text/plain, inline)]
[readding the list]

On 02/02/2011 02:11 PM, Kostya Stopani wrote:
> On Wed, Feb 02, 2011 at 10:15:53AM -0700, Eric Blake wrote:
> 
>> Thanks for the patch.  However, it's not trivial, so it would need
>> copyright assignment.
> 
> Oh boy... Anyway I don't mind signing papers, if you (or whoever)
> don't mind bothering with it.

OK, I'll send you those details off-list.

> 
>> Furthermore, there are already known issues where upstream coreutils
>> is lacking multibyte character support, but a solution has to be
>> both maintainable and no-impact to the single-byte locale case.
> 
> I believe this patch doesn't break single-byte behavior because no
> conversion takes place. mbsnrtowcs() is used only to count
> characters. I've tested various cases (8-bit encoding was KOI8-R):
> 
> |--------+---------------+--------------------------|
> | Locale | Text encoding | Result                   |
> |--------+---------------+--------------------------|
> | UTF-8  | UTF-8         | old fmt: text too narrow |
> |        |               | new fmt: ok              |
> |--------+---------------+--------------------------|
> | UTF-8  | 8-bit         | same                     |
> |--------+---------------+--------------------------|
> | 8-bit  | UTF-8         | same                     |
> |--------+---------------+--------------------------|
> | 8-bit  | 8-bit         | same                     |
> |--------+---------------+--------------------------|
> 
> From my point of view the alternative is to convert everything to
> wchar_t, which imposes the need to keep track of conversion errors and
> gracefully fall back to single-byte.

Keeping things in multibyte rather than converting to wchar_t is the way
to go (especially given the ongoing discussion of how to handle the fact
that on cygwin, wchar_t is UTF-16 and thus still multi-unit as an
extension to POSIX, with all sorts of ramifications to programs that
expect POSIX semantics).

-- 
Eric Blake   eblake <at> redhat.com    +1-801-349-2682
Libvirt virtualization library http://libvirt.org

[signature.asc (application/pgp-signature, attachment)]

Information forwarded to owner <at> debbugs.gnu.org, bug-coreutils <at> gnu.org:
bug#7960; Package coreutils. (Sun, 17 Apr 2011 09:33:01 GMT) Full text and rfc822 format available.

Message #14 received at 7960 <at> debbugs.gnu.org (full text, mbox):

From: Jim Meyering <jim <at> meyering.net>
To: 7960 <at> debbugs.gnu.org
Subject: Re: bug#7960: [PATCH] fmt: fix formatting multibyte text (bug #7372)
Date: Sun, 17 Apr 2011 11:32:49 +0200
tags 7960 + moreinfo
thanks

Eric Blake wrote:
> [readding the list]
>
> On 02/02/2011 02:11 PM, Kostya Stopani wrote:
>> On Wed, Feb 02, 2011 at 10:15:53AM -0700, Eric Blake wrote:
>>
>>> Thanks for the patch.  However, it's not trivial, so it would need
>>> copyright assignment.
>>
>> Oh boy... Anyway I don't mind signing papers, if you (or whoever)
>> don't mind bothering with it.
>
> OK, I'll send you those details off-list.

Marked as "moreinfo" since now we're waiting for
copyright assignment paperwork.




Added tag(s) moreinfo. Request was from Jim Meyering <jim <at> meyering.net> to control <at> debbugs.gnu.org. (Sun, 17 Apr 2011 11:26:01 GMT) Full text and rfc822 format available.

Information forwarded to bug-coreutils <at> gnu.org:
bug#7960; Package coreutils. (Tue, 23 Oct 2018 03:09:02 GMT) Full text and rfc822 format available.

Message #19 received at 7960 <at> debbugs.gnu.org (full text, mbox):

From: Assaf Gordon <assafgordon <at> gmail.com>
To: 7960 <at> debbugs.gnu.org
Subject: Re: bug#7960: [PATCH] fmt: fix formatting multibyte text (bug #7372)
Date: Mon, 22 Oct 2018 21:08:34 -0600
retitle 7960 multibyte: fmt: fix formatting multibyte text (bug #7372)
close 7960
stop

(triaging old bugs)

On 17/04/11 03:32 AM, Jim Meyering wrote:
> 
> Eric Blake wrote:
>> [readding the list]
>>
>> On 02/02/2011 02:11 PM, Kostya Stopani wrote:
>>> On Wed, Feb 02, 2011 at 10:15:53AM -0700, Eric Blake wrote:
>>>
>>>> Thanks for the patch.  However, it's not trivial, so it would need
>>>> copyright assignment.
>>>
>>> Oh boy... Anyway I don't mind signing papers, if you (or whoever)
>>> don't mind bothering with it.
>>
>> OK, I'll send you those details off-list.
> 
> Marked as "moreinfo" since now we're waiting for
> copyright assignment paperwork.

With no further follow-ups in 7 years
(and the original author's name not in the copyright.list file),
I'm closing this bug.

If there are new developments, we can always re-open it.

regards,
 -assaf





Changed bug title to 'multibyte: fmt: fix formatting multibyte text (bug #7372)' from '[PATCH] fmt: fix formatting multibyte text (bug #7372)' Request was from Assaf Gordon <assafgordon <at> gmail.com> to control <at> debbugs.gnu.org. (Tue, 23 Oct 2018 03:09:04 GMT) Full text and rfc822 format available.

bug closed, send any further explanations to 7960 <at> debbugs.gnu.org and Kostya Stopani <hatta <at> depni.sinp.msu.ru> Request was from Assaf Gordon <assafgordon <at> gmail.com> to control <at> debbugs.gnu.org. (Tue, 23 Oct 2018 03:09:04 GMT) Full text and rfc822 format available.

bug archived. Request was from Debbugs Internal Request <help-debbugs <at> gnu.org> to internal_control <at> debbugs.gnu.org. (Tue, 20 Nov 2018 12:24:08 GMT) Full text and rfc822 format available.

This bug report was last modified 6 years and 264 days ago.

Previous Next


GNU bug tracking system
Copyright (C) 1999 Darren O. Benham, 1997,2003 nCipher Corporation Ltd, 1994-97 Ian Jackson.