001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.tools;
003
004//// Taken from http://www.bmsi.com/java/#diff
005
006// http://www.bmsi.com/java/DiffPrint.java could also be useful
007
008/*
009 * $Log: Diff.java,v $
010 * Revision 1.15  2013/04/01 16:27:31  stuart
011 * Fix DiffPrint unified format with test cases.
012 * Begin porting some diff-2.7 features.
013 *
014 * Revision 1.14  2010/03/03 21:21:25  stuart
015 * Test new direct equivalence API
016 *
017 * Revision 1.13  2009/12/07 17:43:17  stuart
018 * Compute equivMax for int[] ctor
019 *
020 * Revision 1.12  2009/12/07 17:34:46  stuart
021 * Ctor with int[].
022 *
023 * Revision 1.11  2009/11/15 01:11:54  stuart
024 * Diff doesn't need to be generic
025 *
026 * Revision 1.10  2009/11/15 00:54:03  stuart
027 * Update to Java 5 containers
028 *
029 * Revision 1.7  2009/01/19 03:05:26  stuart
030 * Fix StackOverflow bug with heuristic on reported by Jimmy Han.
031 *
032 * Revision 1.6  2003/03/06 22:51:32  stuart
033 * Convert to CVS
034 *
035 * Revision 1.5  2002/07/19  19:14:40  stuart
036 * fix reverseScript, make change ctor public, update docs
037 *
038 * Revision 1.4  2002/04/09  17:53:39  stuart
039 * More flexible interface for diff() function.
040 *
041 * Revision 1.3  2000/03/03 21:58:03  stuart
042 * move discard_confusing_lines and shift_boundaries to class file_data
043 *
044 * Revision 1.2  2000/03/02  16:37:38  stuart
045 * Add GPL and copyright
046 *
047 */
048
049import java.util.HashMap;
050import java.util.Map;
051
052/** A class to compare vectors of objects.  The result of comparison
053    is a list of <code>change</code> objects which form an
054    edit script.  The objects compared are traditionally lines
055    of text from two files.  Comparison options such as "ignore
056    whitespace" are implemented by modifying the <code>equals</code>
057    and <code>hashcode</code> methods for the objects compared.
058<p>
059   The basic algorithm is described in: <br>
060   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
061   Algorithmica Vol. 1 No. 2, 1986, p 251.
062<p>
063   This class outputs different results from GNU diff 1.15 on some
064   inputs.  Our results are actually better (smaller change list, smaller
065   total size of changes), but it would be nice to know why.  Perhaps
066   there is a memory overwrite bug in GNU diff 1.15.
067
068  @author Stuart D. Gathman, translated from GNU diff 1.15
069    Copyright (C) 2000  Business Management Systems, Inc.
070<p>
071    This program is free software; you can redistribute it and/or modify
072    it under the terms of the GNU General Public License as published by
073    the Free Software Foundation; either version 1, or (at your option)
074    any later version.
075<p>
076    This program is distributed in the hope that it will be useful,
077    but WITHOUT ANY WARRANTY; without even the implied warranty of
078    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
079    GNU General Public License for more details.
080<p>
081    You should have received a copy of the <a href=COPYING.txt>
082    GNU General Public License</a>
083    along with this program; if not, write to the Free Software
084    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
085 */
086public class Diff {
087
088    /** Prepare to find differences between two arrays.  Each element of
089      the arrays is translated to an "equivalence number" based on
090      the result of <code>equals</code>.  The original Object arrays
091      are no longer needed for computing the differences.  They will
092      be needed again later to print the results of the comparison as
093      an edit script, if desired.
094     * @param a first array
095     * @param b second array
096     */
097    public Diff(Object[] a, Object[] b) {
098        Map<Object, Integer> h = new HashMap<>(a.length + b.length);
099        filevec = new FileData[] {new FileData(a, h), new FileData(b, h)};
100    }
101
102    /** 1 more than the maximum equivalence value used for this or its
103     sibling file. */
104    private int equivMax = 1;
105
106    /** When set to true, the comparison uses a heuristic to speed it up.
107    With this heuristic, for files with a constant small density
108    of changes, the algorithm is linear in the file size.  */
109    public boolean heuristic;
110
111    /** When set to true, the algorithm returns a guarranteed minimal
112      set of changes.  This makes things slower, sometimes much slower. */
113    public boolean noDiscards;
114
115    private int[] xvec, yvec; /* Vectors being compared. */
116    private int[] fdiag;      /* Vector, indexed by diagonal, containing
117                   the X coordinate of the point furthest
118                   along the given diagonal in the forward
119                   search of the edit matrix. */
120    private int[] bdiag;      /* Vector, indexed by diagonal, containing
121                   the X coordinate of the point furthest
122                   along the given diagonal in the backward
123                   search of the edit matrix. */
124    private int fdiagoff, bdiagoff;
125    private final FileData[] filevec;
126    private int cost;
127    /** Snakes bigger than this are considered "big". */
128    private static final int SNAKE_LIMIT = 20;
129
130    /**
131     * Find the midpoint of the shortest edit script for a specified
132     * portion of the two files.
133     *
134     * We scan from the beginnings of the files, and simultaneously from the ends,
135     * doing a breadth-first search through the space of edit-sequence.
136     * When the two searches meet, we have found the midpoint of the shortest
137     * edit sequence.
138     *
139     * The value returned is the number of the diagonal on which the midpoint lies.
140     * The diagonal number equals the number of inserted lines minus the number
141     * of deleted lines (counting only lines before the midpoint).
142     * The edit cost is stored into COST; this is the total number of
143     * lines inserted or deleted (counting only lines before the midpoint).
144     *
145     * This function assumes that the first lines of the specified portions
146     * of the two files do not match, and likewise that the last lines do not
147     * match.  The caller must trim matching lines from the beginning and end
148     * of the portions it is going to specify.
149     *
150     * Note that if we return the "wrong" diagonal value, or if
151     * the value of bdiag at that diagonal is "wrong",
152     * the worst this can do is cause suboptimal diff output.
153     * It cannot cause incorrect diff output.
154     */
155    private int diag(int xoff, int xlim, int yoff, int ylim) {
156        final int[] fd = fdiag; // Give the compiler a chance.
157        final int[] bd = bdiag; // Additional help for the compiler.
158        final int[] xv = xvec;      // Still more help for the compiler.
159        final int[] yv = yvec;      // And more and more . . .
160        final int dmin = xoff - ylim;   // Minimum valid diagonal.
161        final int dmax = xlim - yoff;   // Maximum valid diagonal.
162        final int fmid = xoff - yoff;   // Center diagonal of top-down search.
163        final int bmid = xlim - ylim;   // Center diagonal of bottom-up search.
164        int fmin = fmid, fmax = fmid;   // Limits of top-down search.
165        int bmin = bmid, bmax = bmid;   // Limits of bottom-up search.
166        /* True if southeast corner is on an odd
167                     diagonal with respect to the northwest. */
168        final boolean odd = (fmid - bmid & 1) != 0;
169
170        fd[fdiagoff + fmid] = xoff;
171        bd[bdiagoff + bmid] = xlim;
172
173        for (int c = 1;; ++c) {
174            int d;          /* Active diagonal. */
175            boolean big_snake = false;
176
177            /* Extend the top-down search by an edit step in each diagonal. */
178            if (fmin > dmin) {
179                fd[fdiagoff + --fmin - 1] = -1;
180            } else {
181                ++fmin;
182            }
183            if (fmax < dmax) {
184                fd[fdiagoff + ++fmax + 1] = -1;
185            } else {
186                --fmax;
187            }
188            for (d = fmax; d >= fmin; d -= 2) {
189                int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
190
191                if (tlo >= thi) {
192                    x = tlo + 1;
193                } else {
194                    x = thi;
195                }
196                oldx = x;
197                y = x - d;
198                while (x < xlim && y < ylim && xv[x] == yv[y]) {
199                    ++x; ++y;
200                }
201                if (x - oldx > SNAKE_LIMIT) {
202                    big_snake = true;
203                }
204                fd[fdiagoff + d] = x;
205                if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
206                    cost = 2 * c - 1;
207                    return d;
208                }
209            }
210
211            /* Similar extend the bottom-up search. */
212            if (bmin > dmin) {
213                bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
214            } else {
215                ++bmin;
216            }
217            if (bmax < dmax) {
218                bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
219            } else {
220                --bmax;
221            }
222            for (d = bmax; d >= bmin; d -= 2) {
223                int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
224
225                if (tlo < thi) {
226                    x = tlo;
227                } else {
228                    x = thi - 1;
229                }
230                oldx = x;
231                y = x - d;
232                while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
233                    --x; --y;
234                }
235                if (oldx - x > SNAKE_LIMIT) {
236                    big_snake = true;
237                }
238                bd[bdiagoff + d] = x;
239                if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
240                    cost = 2 * c;
241                    return d;
242                }
243            }
244
245            /* Heuristic: check occasionally for a diagonal that has made
246       lots of progress compared with the edit distance.
247       If we have any such, find the one that has made the most
248       progress and return it as if it had succeeded.
249
250       With this heuristic, for files with a constant small density
251       of changes, the algorithm is linear in the file size.  */
252
253            if (c > 200 && big_snake && heuristic) {
254                int best = 0;
255                int bestpos = -1;
256
257                for (d = fmax; d >= fmin; d -= 2) {
258                    int dd = d - fmid;
259                    int x = fd[fdiagoff + d];
260                    int y = x - d;
261                    int v = (x - xoff) * 2 - dd;
262                    if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
263                        if (v > best
264                                && xoff + SNAKE_LIMIT <= x && x < xlim
265                                && yoff + SNAKE_LIMIT <= y && y < ylim) {
266                            /* We have a good enough best diagonal.
267                               now insist that it end with a significant snake.  */
268                            int k;
269
270                            for (k = 1; xvec[x - k] == yvec[y - k]; k++) {
271                                if (k == SNAKE_LIMIT) {
272                                    best = v;
273                                    bestpos = d;
274                                    break;
275                                }
276                            }
277                        }
278                    }
279                }
280                if (best > 0) {
281                    cost = 2 * c - 1;
282                    return bestpos;
283                }
284
285                best = 0;
286                for (d = bmax; d >= bmin; d -= 2) {
287                    int dd = d - bmid;
288                    int x = bd[bdiagoff + d];
289                    int y = x - d;
290                    int v = (xlim - x) * 2 + dd;
291                    if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
292                        if (v > best
293                                && xoff < x && x <= xlim - SNAKE_LIMIT
294                                && yoff < y && y <= ylim - SNAKE_LIMIT) {
295                            /* We have a good enough best diagonal.
296                               now insist that it end with a significant snake.  */
297                            int k;
298
299                            for (k = 0; xvec[x + k] == yvec[y + k]; k++) {
300                                if (k == SNAKE_LIMIT) {
301                                    best = v;
302                                    bestpos = d;
303                                    break;
304                                }
305                            }
306                        }
307                    }
308                }
309                if (best > 0) {
310                    cost = 2 * c - 1;
311                    return bestpos;
312                }
313            }
314        }
315    }
316
317    /** Compare in detail contiguous subsequences of the two files
318     which are known, as a whole, to match each other.
319
320     The results are recorded in the vectors filevec[N].changed_flag, by
321     storing a 1 in the element for each line that is an insertion or deletion.
322
323     The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
324
325     Note that XLIM, YLIM are exclusive bounds.
326     All line numbers are origin-0 and discarded lines are not counted.  */
327
328    private void compareseq(int xoff, int xlim, int yoff, int ylim) {
329        /* Slide down the bottom initial diagonal. */
330        while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
331            ++xoff; ++yoff;
332        }
333        /* Slide up the top initial diagonal. */
334        while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
335            --xlim; --ylim;
336        }
337
338        /* Handle simple cases. */
339        if (xoff == xlim) {
340            while (yoff < ylim) {
341                filevec[1].changedFlag[1+filevec[1].realindexes[yoff++]] = true;
342            }
343        } else if (yoff == ylim) {
344            while (xoff < xlim) {
345                filevec[0].changedFlag[1+filevec[0].realindexes[xoff++]] = true;
346            }
347        } else {
348            /* Find a point of correspondence in the middle of the files.  */
349
350            int d = diag(xoff, xlim, yoff, ylim);
351            int c = cost;
352            int b = bdiag[bdiagoff + d];
353
354            if (c == 1)
355                /* This should be impossible, because it implies that
356                   one of the two subsequences is empty,
357                   and that case was handled above without calling `diag'.
358                   Let's verify that this is true.  */
359                throw new IllegalArgumentException("Empty subsequence");
360            else {
361                /* Use that point to split this problem into two subproblems.  */
362                compareseq(xoff, b, yoff, b - d);
363                /* This used to use f instead of b,
364                   but that is incorrect!
365                   It is not necessarily the case that diagonal d
366                   has a snake from b to f.  */
367                compareseq(b, xlim, b - d, ylim);
368            }
369        }
370    }
371
372    /** Discard lines from one file that have no matches in the other file.
373     */
374    private void discard_confusing_lines() {
375        filevec[0].discard_confusing_lines(filevec[1]);
376        filevec[1].discard_confusing_lines(filevec[0]);
377    }
378
379    private boolean inhibit;
380
381    /** Adjust inserts/deletes of blank lines to join changes
382        as much as possible.
383     */
384    private void shift_boundaries() {
385        if (inhibit)
386            return;
387        filevec[0].shift_boundaries(filevec[1]);
388        filevec[1].shift_boundaries(filevec[0]);
389    }
390
391    public interface ScriptBuilder {
392        /** Scan the tables of which lines are inserted and deleted,
393            producing an edit script.
394            @param changed0 true for lines in first file which do not match 2nd
395            @param len0 number of lines in first file
396            @param changed1 true for lines in 2nd file which do not match 1st
397            @param len1 number of lines in 2nd file
398            @return a linked list of changes - or null
399         */
400        Change build_script(
401                boolean[] changed0, int len0,
402                boolean[] changed1, int len1
403        );
404    }
405
406    /** Scan the tables of which lines are inserted and deleted,
407     producing an edit script in reverse order.  */
408
409    static class ReverseScript implements ScriptBuilder {
410        @Override
411        public  Change build_script(
412                final boolean[] changed0, int len0,
413                final boolean[] changed1, int len1) {
414            Change script = null;
415            int i0 = 0, i1 = 0;
416            while (i0 < len0 || i1 < len1) {
417                if (changed0[1+i0] || changed1[1+i1]) {
418                    int line0 = i0, line1 = i1;
419
420                    /* Find # lines changed here in each file.  */
421                    while (changed0[1+i0]) {
422                        ++i0;
423                    }
424                    while (changed1[1+i1]) {
425                        ++i1;
426                    }
427
428                    /* Record this change.  */
429                    script = new Change(line0, line1, i0 - line0, i1 - line1, script);
430                }
431
432                /* We have reached lines in the two files that match each other.  */
433                i0++; i1++;
434            }
435
436            return script;
437        }
438    }
439
440    static class ForwardScript implements ScriptBuilder {
441        /** Scan the tables of which lines are inserted and deleted,
442            producing an edit script in forward order.  */
443        @Override
444        public Change build_script(
445                final boolean[] changed0, int len0,
446                final boolean[] changed1, int len1) {
447            Change script = null;
448            int i0 = len0, i1 = len1;
449
450            while (i0 >= 0 || i1 >= 0) {
451                if (changed0[i0] || changed1[i1]) {
452                    int line0 = i0, line1 = i1;
453
454                    /* Find # lines changed here in each file.  */
455                    while (changed0[i0]) {
456                        --i0;
457                    }
458                    while (changed1[i1]) {
459                        --i1;
460                    }
461
462                    /* Record this change.  */
463                    script = new Change(i0, i1, line0 - i0, line1 - i1, script);
464                }
465
466                /* We have reached lines in the two files that match each other.  */
467                i0--; i1--;
468            }
469
470            return script;
471        }
472    }
473
474    /** Standard ScriptBuilders. */
475    public static final ScriptBuilder
476    forwardScript = new ForwardScript(),
477    reverseScript = new ReverseScript();
478
479    /** Report the differences of two files. DEPTH is the current directory depth. */
480    public final Change diff_2(final boolean reverse) {
481        return diff(reverse ? reverseScript : forwardScript);
482    }
483
484    /** Get the results of comparison as an edit script.  The script
485     is described by a list of changes.  The standard ScriptBuilder
486     implementations provide for forward and reverse edit scripts.
487     Alternate implementations could, for instance, list common elements
488     instead of differences.
489     @param bld an object to build the script from change flags
490     @return the head of a list of changes
491     */
492    public Change diff(final ScriptBuilder bld) {
493
494        /* Some lines are obviously insertions or deletions
495       because they don't match anything.  Detect them now,
496       and avoid even thinking about them in the main comparison algorithm.  */
497
498        discard_confusing_lines();
499
500        /* Now do the main comparison algorithm, considering just the
501       undiscarded lines.  */
502
503        xvec = filevec[0].undiscarded;
504        yvec = filevec[1].undiscarded;
505
506        int diags =
507            filevec[0].nondiscardedLines + filevec[1].nondiscardedLines + 3;
508        fdiag = new int[diags];
509        fdiagoff = filevec[1].nondiscardedLines + 1;
510        bdiag = new int[diags];
511        bdiagoff = filevec[1].nondiscardedLines + 1;
512
513        compareseq(0, filevec[0].nondiscardedLines,
514                0, filevec[1].nondiscardedLines);
515        fdiag = null;
516        bdiag = null;
517
518        /* Modify the results slightly to make them prettier
519       in cases where that can validly be done.  */
520
521        shift_boundaries();
522
523        /* Get the results of comparison in the form of a chain
524       of `struct change's -- an edit script.  */
525        return bld.build_script(
526                filevec[0].changedFlag,
527                filevec[0].bufferedLines,
528                filevec[1].changedFlag,
529                filevec[1].bufferedLines
530        );
531
532    }
533
534    /** The result of comparison is an "edit script": a chain of change objects.
535     Each change represents one place where some lines are deleted
536     and some are inserted.
537
538     LINE0 and LINE1 are the first affected lines in the two files (origin 0).
539     DELETED is the number of lines deleted here from file 0.
540     INSERTED is the number of lines inserted here in file 1.
541
542     If DELETED is 0 then LINE0 is the number of the line before
543     which the insertion was done; vice versa for INSERTED and LINE1.  */
544
545    public static class Change {
546        /** Previous or next edit command. */
547        public Change link;
548        /** # lines of file 1 changed here.  */
549        public final int inserted;
550        /** # lines of file 0 changed here.  */
551        public final int deleted;
552        /** Line number of 1st deleted line.  */
553        public final int line0;
554        /** Line number of 1st inserted line.  */
555        public final int line1;
556
557        /** Cons an additional entry onto the front of an edit script OLD.
558       LINE0 and LINE1 are the first affected lines in the two files (origin 0).
559       DELETED is the number of lines deleted here from file 0.
560       INSERTED is the number of lines inserted here in file 1.
561
562       If DELETED is 0 then LINE0 is the number of the line before
563       which the insertion was done; vice versa for INSERTED and LINE1.  */
564        public Change(int line0, int line1, int deleted, int inserted, Change old) {
565            this.line0 = line0;
566            this.line1 = line1;
567            this.inserted = inserted;
568            this.deleted = deleted;
569            this.link = old;
570        }
571
572        /**
573         * Returns the number of insertions and deletions of this change as well as
574         * (recursively) the changes linked via {@link #link}.
575         */
576        public int getTotalNumberOfChanges() {
577            return inserted + deleted + (link != null ? link.getTotalNumberOfChanges() : 0);
578        }
579
580        @Override
581        public String toString() {
582            String s = String.format("%d -%d +%d %d", line0, deleted, inserted, line1);
583            return (link != null) ? s = s + '\n' + link : s;
584        }
585    }
586
587    /** Data on one input file being compared.
588     */
589    class FileData {
590
591        /** Allocate changed array for the results of comparison.  */
592        void clear() {
593            /* Allocate a flag for each line of each file, saying whether that line
594               is an insertion or deletion.
595               Allocate an extra element, always zero, at each end of each vector.
596             */
597            changedFlag = new boolean[bufferedLines + 2];
598        }
599
600        /** Return equiv_count[I] as the number of lines in this file
601         that fall in equivalence class I.
602         @return the array of equivalence class counts.
603         */
604        int[] equivCount() {
605            int[] equiv_count = new int[equivMax];
606            for (int i = 0; i < bufferedLines; ++i) {
607                ++equiv_count[equivs[i]];
608            }
609            return equiv_count;
610        }
611
612        /** Discard lines that have no matches in another file.
613
614       A line which is discarded will not be considered by the actual
615       comparison algorithm; it will be as if that line were not in the file.
616       The file's `realindexes' table maps virtual line numbers
617       (which don't count the discarded lines) into real line numbers;
618       this is how the actual comparison algorithm produces results
619       that are comprehensible when the discarded lines are counted.
620<p>
621       When we discard a line, we also mark it as a deletion or insertion
622       so that it will be printed in the output.
623      @param f the other file
624         */
625        void discard_confusing_lines(FileData f) {
626            clear();
627            /* Set up table of which lines are going to be discarded. */
628            final byte[] discarded = discardable(f.equivCount());
629
630            /* Don't really discard the provisional lines except when they occur
631       in a run of discardables, with nonprovisionals at the beginning
632       and end.  */
633            filterDiscards(discarded);
634
635            /* Actually discard the lines. */
636            discard(discarded);
637        }
638
639        /**
640         * Mark to be discarded each line that matches no line of another file.
641         * If a line matches many lines, mark it as provisionally discardable.
642         * @param counts The count of each equivalence number for the other file.
643         * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable for each line
644         * @see #equivCount()
645         */
646        private byte[] discardable(final int[] counts) {
647            final int end = bufferedLines;
648            final byte[] discards = new byte[end];
649            final int[] equivs = this.equivs;
650            int many = 5;
651            int tem = end / 64;
652
653            /* Multiply MANY by approximate square root of number of lines.
654               That is the threshold for provisionally discardable lines.  */
655            while ((tem = tem >> 2) > 0) {
656                many *= 2;
657            }
658
659            for (int i = 0; i < end; i++) {
660                int nmatch;
661                if (equivs[i] == 0) {
662                    continue;
663                }
664                nmatch = counts[equivs[i]];
665                if (nmatch == 0) {
666                    discards[i] = 1;
667                } else if (nmatch > many) {
668                    discards[i] = 2;
669                }
670            }
671            return discards;
672        }
673
674        /**
675         * Don't really discard the provisional lines except when they occur
676         * in a run of discardables, with nonprovisionals at the beginning
677         * and end.
678         */
679        private void filterDiscards(final byte[] discards) {
680            final int end = bufferedLines;
681
682            for (int i = 0; i < end; i++) {
683                /* Cancel provisional discards not in middle of run of discards.  */
684                if (discards[i] == 2) {
685                    discards[i] = 0;
686                } else if (discards[i] != 0) {
687                    /* We have found a nonprovisional discard.  */
688                    int j;
689                    int length;
690                    int provisional = 0;
691
692                    /* Find end of this run of discardable lines.
693                       Count how many are provisionally discardable.  */
694                    for (j = i; j < end; j++) {
695                        if (discards[j] == 0) {
696                            break;
697                        }
698                        if (discards[j] == 2) {
699                            ++provisional;
700                        }
701                    }
702
703                    /* Cancel provisional discards at end, and shrink the run.  */
704                    while (j > i && discards[j - 1] == 2) {
705                        discards[--j] = 0; --provisional;
706                    }
707
708                    /* Now we have the length of a run of discardable lines
709                       whose first and last are not provisional.  */
710                    length = j - i;
711
712                    /* If 1/4 of the lines in the run are provisional,
713                       cancel discarding of all provisional lines in the run.  */
714                    if (provisional * 4 > length) {
715                        while (j > i)
716                            if (discards[--j] == 2) {
717                                discards[j] = 0;
718                            }
719                    } else {
720                        int consec;
721                        int minimum = 1;
722                        int tem = length / 4;
723
724                        /* MINIMUM is approximate square root of LENGTH/4.
725                           A subrun of two or more provisionals can stand
726                           when LENGTH is at least 16.
727                           A subrun of 4 or more can stand when LENGTH >= 64.  */
728                        while ((tem = tem >> 2) > 0) {
729                            minimum *= 2;
730                        }
731                        minimum++;
732
733                        /* Cancel any subrun of MINIMUM or more provisionals
734                           within the larger run.  */
735                        for (j = 0, consec = 0; j < length; j++) {
736                            if (discards[i + j] != 2) {
737                                consec = 0;
738                            } else if (minimum == ++consec) {
739                                /* Back up to start of subrun, to cancel it all.  */
740                                j -= consec;
741                            } else if (minimum < consec) {
742                                discards[i + j] = 0;
743                            }
744                        }
745
746                        /* Scan from beginning of run
747                           until we find 3 or more nonprovisionals in a row
748                           or until the first nonprovisional at least 8 lines in.
749                           Until that point, cancel any provisionals.  */
750                        for (j = 0, consec = 0; j < length; j++) {
751                            if (j >= 8 && discards[i + j] == 1) {
752                                break;
753                            }
754                            if (discards[i + j] == 2) {
755                                consec = 0; discards[i + j] = 0;
756                            } else if (discards[i + j] == 0) {
757                                consec = 0;
758                            } else {
759                                consec++;
760                            }
761                            if (consec == 3) {
762                                break;
763                            }
764                        }
765
766                        /* I advances to the last line of the run.  */
767                        i += length - 1;
768
769                        /* Same thing, from end.  */
770                        for (j = 0, consec = 0; j < length; j++) {
771                            if (j >= 8 && discards[i - j] == 1) {
772                                break;
773                            }
774                            if (discards[i - j] == 2) {
775                                consec = 0; discards[i - j] = 0;
776                            } else if (discards[i - j] == 0) {
777                                consec = 0;
778                            } else {
779                                consec++;
780                            }
781                            if (consec == 3) {
782                                break;
783                            }
784                        }
785                    }
786                }
787            }
788        }
789
790        /** Actually discard the lines.
791            @param discards flags lines to be discarded
792         */
793        private void discard(final byte[] discards) {
794            final int end = bufferedLines;
795            int j = 0;
796            for (int i = 0; i < end; ++i) {
797                if (noDiscards || discards[i] == 0) {
798                    undiscarded[j] = equivs[i];
799                    realindexes[j++] = i;
800                } else {
801                    changedFlag[1+i] = true;
802                }
803            }
804            nondiscardedLines = j;
805        }
806
807        FileData(int length) {
808            bufferedLines = length;
809            equivs = new int[length];
810            undiscarded = new int[bufferedLines];
811            realindexes = new int[bufferedLines];
812        }
813
814        FileData(Object[] data, Map<Object, Integer> h) {
815            this(data.length);
816            // FIXME: diff 2.7 removes common prefix and common suffix
817            for (int i = 0; i < data.length; ++i) {
818                Integer ir = h.get(data[i]);
819                if (ir == null) {
820                    h.put(data[i], equivs[i] = equivMax++);
821                } else {
822                    equivs[i] = ir.intValue();
823                }
824            }
825        }
826
827        /** Adjust inserts/deletes of blank lines to join changes
828       as much as possible.
829
830       We do something when a run of changed lines include a blank
831       line at one end and have an excluded blank line at the other.
832       We are free to choose which blank line is included.
833       `compareseq' always chooses the one at the beginning,
834       but usually it is cleaner to consider the following blank line
835       to be the "change".  The only exception is if the preceding blank line
836       would join this change to other changes.
837      @param f the file being compared against
838         */
839        void shift_boundaries(FileData f) {
840            final boolean[] changed = changedFlag;
841            final boolean[] other_changed = f.changedFlag;
842            int i = 0;
843            int j = 0;
844            int i_end = bufferedLines;
845            int preceding = -1;
846            int other_preceding = -1;
847
848            for (;;) {
849                int start, end, other_start;
850
851                /* Scan forwards to find beginning of another run of changes.
852                   Also keep track of the corresponding point in the other file.  */
853
854                while (i < i_end && !changed[1+i]) {
855                    while (other_changed[1+j++]) {
856                        /* Non-corresponding lines in the other file
857                           will count as the preceding batch of changes.  */
858                        other_preceding = j;
859                    }
860                    i++;
861                }
862
863                if (i == i_end) {
864                    break;
865                }
866
867                start = i;
868                other_start = j;
869
870                for (;;) {
871                    /* Now find the end of this run of changes.  */
872
873                    while (i < i_end && changed[1+i]) {
874                        i++;
875                    }
876                    end = i;
877
878                    /* If the first changed line matches the following unchanged one,
879                       and this run does not follow right after a previous run,
880                       and there are no lines deleted from the other file here,
881                       then classify the first changed line as unchanged
882                       and the following line as changed in its place.  */
883
884                    /* You might ask, how could this run follow right after another?
885                       Only because the previous run was shifted here.  */
886
887                    if (end != i_end && equivs[start] == equivs[end] && !other_changed[1+j]
888                         && !((preceding >= 0 && start == preceding) || (other_preceding >= 0 && other_start == other_preceding))) {
889                        changed[1+end++] = true;
890                        changed[1+start++] = false;
891                        ++i;
892                        /* Since one line-that-matches is now before this run
893                           instead of after, we must advance in the other file
894                           to keep in synch.  */
895                        ++j;
896                    } else {
897                        break;
898                    }
899                }
900
901                preceding = i;
902                other_preceding = j;
903            }
904        }
905
906        /** Number of elements (lines) in this file. */
907        private final int bufferedLines;
908
909        /** Vector, indexed by line number, containing an equivalence code for
910           each line.  It is this vector that is actually compared with that
911           of another file to generate differences. */
912        private final int[]     equivs;
913
914        /** Vector, like the previous one except that
915           the elements for discarded lines have been squeezed out.  */
916        private final int[]    undiscarded;
917
918        /** Vector mapping virtual line numbers (not counting discarded lines)
919           to real ones (counting those lines).  Both are origin-0.  */
920        private final int[]    realindexes;
921
922        /** Total number of nondiscarded lines. */
923        private int         nondiscardedLines;
924
925        /** Array, indexed by real origin-1 line number,
926           containing true for a line that is an insertion or a deletion.
927           The results of comparison are stored here.  */
928        private boolean[]       changedFlag;
929    }
930}