Package: diffutils;
Reported by: Dave Gordon <david.s.gordon <at> intel.com>
Date: Wed, 11 Feb 2015 16:51:01 UTC
Severity: normal
Message #5 received at submit <at> debbugs.gnu.org (full text, mbox):
From: Dave Gordon <david.s.gordon <at> intel.com> To: bug-diffutils <at> gnu.org Subject: RFC: diff: skip initial columns before comparing Date: Wed, 11 Feb 2015 14:33:21 +0000
When comparing certain types of files, notably timestamped logfiles such as the output of dmesg(1), it's necessary to ignore the initial characters on each line; otherwise every line is different. In the simplest case, this can be done by applying 'cut(1)' to each input; but then, important information about when the difference(s) occurred is lost, and it can be difficult to find the relevant lines in the original files, especially if they are highly repetitive (as logfiles often are). What is needed in this situation is to ignore the timestamps for purposes of comparison, but then include them in any lines copied to the output. So this patch adds a new option (long form only) "--ignore-initial=N" to ignore the first N characters of each line. This is done by skipping the first N characters of each line in find_and_hash_each_line(), and likewise lines_differ(). The hashing or comparison of the remaining part of the line then proceeds as usual. One subtle point: if both of the lines have fewer than N characters, the lines are considered equal iff they have the same length. Usually, the type of file you would use this option with will have a fixed-format prefix (which is the part to be ignored), and a line missing this prefix is generally an indication of a formatting error. So a line with the prefix but no further content should NOT match an empty line or a line with a truncated prefix; but we still want two empty lines to match each other. For example, with --ignore-initial=10: These lines match: [22:47:25] hello [23:17:24] hello These lines don't match: [22:47:25] hello [23:17:24] Nor do these: [22:47:] [23:17:24] But these do: [NOCLOCK] [CLKFAIL] Hope this looks useful! .Dave. 
----------------------- diff --git a/src/diff.c b/src/diff.c index 50d0365..eccce21 100644 --- a/src/diff.c +++ b/src/diff.c @@ -121,6 +121,7 @@ enum NO_IGNORE_FILE_NAME_CASE_OPTION, NORMAL_OPTION, SDIFF_MERGE_ASSIST_OPTION, + SKIP_INITIAL_OPTION, STRIP_TRAILING_CR_OPTION, SUPPRESS_BLANK_EMPTY_OPTION, SUPPRESS_COMMON_LINES_OPTION, @@ -173,6 +174,7 @@ static struct option const longopts[] = {"ignore-blank-lines", 0, 0, 'B'}, {"ignore-case", 0, 0, 'i'}, {"ignore-file-name-case", 0, 0, IGNORE_FILE_NAME_CASE_OPTION}, + {"ignore-initial", 1, 0, SKIP_INITIAL_OPTION}, {"ignore-matching-lines", 1, 0, 'I'}, {"ignore-space-change", 0, 0, 'b'}, {"ignore-tab-expansion", 0, 0, 'E'}, @@ -580,6 +582,18 @@ main (int argc, char **argv) sdiff_merge_assist = true; break; + case SKIP_INITIAL_OPTION: + numval = strtoumax (optarg, &numend, 10); + if (! (0 < numval && numval <= SIZE_MAX) || *numend) + try_help ("invalid initial skip '%s'", optarg); + if (initial_skip != numval) + { + if (initial_skip) + fatal ("conflicting initial skip options"); + initial_skip = numval; + } + break; + case STRIP_TRAILING_CR_OPTION: strip_trailing_cr = true; break; @@ -724,7 +738,8 @@ main (int argc, char **argv) files_can_be_treated_as_binary = (brief & binary & ~ (ignore_blank_lines | ignore_case | strip_trailing_cr - | (ignore_regexp_list.regexps || ignore_white_space))); + | (ignore_regexp_list.regexps || ignore_white_space + || initial_skip))); switch_string = option_list (argv + 1, optind - 1); @@ -895,6 +910,7 @@ static char const * const option_help_msgid[] = { N_("-w, --ignore-all-space ignore all white space"), N_("-B, --ignore-blank-lines ignore changes where lines are all blank"), N_("-I, --ignore-matching-lines=RE ignore changes where all lines match RE"), + N_(" --ignore-initial=SKIP ignore the initial SKIP characters of each line"), "", N_("-a, --text treat all files as text"), N_(" --strip-trailing-cr strip trailing carriage return on input"), diff --git a/src/diff.h b/src/diff.h index 
e9f0471..b638a3f 100644 --- a/src/diff.h +++ b/src/diff.h @@ -125,6 +125,9 @@ XTERN enum DIFF_white_space ignore_white_space; /* Ignore changes that affect only blank lines (-B). */ +/* Skip this many initial characters on each line */ +XTERN size_t initial_skip; + /* Files can be compared byte-by-byte, as if they were binary. This depends on various options. */ XTERN bool files_can_be_treated_as_binary; diff --git a/src/io.c b/src/io.c index 463ee35..7e15996 100644 --- a/src/io.c +++ b/src/io.c @@ -232,13 +232,18 @@ find_and_hash_each_line (struct file_data *current) bool diff_length_compare_anyway = ig_white_space != IGNORE_NO_WHITE_SPACE; bool same_length_diff_contents_compare_anyway = - diff_length_compare_anyway | ig_case; + diff_length_compare_anyway | ig_case || initial_skip != 0; while (p < suffix_begin) { char const *ip = p; hash_value h = 0; unsigned char c; + size_t skip = initial_skip; + + while (skip--) + if ((c = *p++) == '\n') + goto hashing_done; /* Hash this line until we find a newline. */ switch (ig_white_space) diff --git a/src/util.c b/src/util.c index 016057d..0acba06 100644 --- a/src/util.c +++ b/src/util.c @@ -413,6 +413,16 @@ lines_differ (char const *s1, char const *s2) register char const *t1 = s1; register char const *t2 = s2; size_t column = 0; + size_t skip = initial_skip; + + while (skip--) + { + register unsigned char c1 = *t1++; + register unsigned char c2 = *t2++; + + if (c1 == '\n' || c2 == '\n') + return c1 != c2; + } while (1) {
GNU bug tracking system
Copyright (C) 1999 Darren O. Benham,
1997,2003 nCipher Corporation Ltd,
1994-97 Ian Jackson.